1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
23#include "llvm/ADT/APFloat.h"
24#include "llvm/ADT/APInt.h"
25#include "llvm/ADT/ArrayRef.h"
26#include "llvm/ADT/STLExtras.h"
27#include "llvm/ADT/SmallSet.h"
30#include "llvm/ADT/Statistic.h"
31#include "llvm/ADT/StringRef.h"
32#include "llvm/ADT/Twine.h"
59#include "llvm/IR/Attributes.h"
60#include "llvm/IR/Constants.h"
61#include "llvm/IR/DataLayout.h"
62#include "llvm/IR/DebugLoc.h"
64#include "llvm/IR/Function.h"
66#include "llvm/IR/GlobalValue.h"
67#include "llvm/IR/IRBuilder.h"
68#include "llvm/IR/Instruction.h"
71#include "llvm/IR/Intrinsics.h"
72#include "llvm/IR/IntrinsicsAArch64.h"
73#include "llvm/IR/Module.h"
75#include "llvm/IR/Type.h"
76#include "llvm/IR/Use.h"
77#include "llvm/IR/Value.h"
82#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <bitset>
94#include <cassert>
95#include <cctype>
96#include <cstdint>
97#include <cstdlib>
98#include <iterator>
99#include <limits>
100#include <optional>
101#include <tuple>
102#include <utility>
103#include <vector>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107
108#define DEBUG_TYPE "aarch64-lower"
109
110STATISTIC(NumTailCalls, "Number of tail calls");
111STATISTIC(NumShiftInserts, "Number of vector shift inserts");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130// in future when both implementations will be based off MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
142// XOR, OR and CMP all use ALU ports, and the data dependency will become the
143// bottleneck after this transform on high-end CPUs. So this maximum leaf-node
144// limitation guards that the cmp+ccmp conversion remains profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
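// Note (illustrative, not from the original source): hidden cl::opts like the
// ones above are typically toggled only for testing, e.g.
// `llc -aarch64-max-xors=8 foo.ll` or `clang -mllvm -aarch64-max-xors=8 ...`.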
147
148// By turning this on, we will not fall back to DAG ISel when encountering
149// scalable vector types for any instruction, even if SVE is not yet supported
150// for some of those instructions.
151// See [AArch64TargetLowering::fallbackToDAGISel] for implementation details.
153 "aarch64-enable-gisel-sve", cl::Hidden,
154 cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
155 cl::init(false));
156
157/// Value type used for condition codes.
158static const MVT MVT_CC = MVT::i32;
159
160static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
161 AArch64::X3, AArch64::X4, AArch64::X5,
162 AArch64::X6, AArch64::X7};
163static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
164 AArch64::Q3, AArch64::Q4, AArch64::Q5,
165 AArch64::Q6, AArch64::Q7};
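// Note (added for clarity): these are the eight general-purpose (x0-x7) and
// eight SIMD/FP (q0-q7) argument registers defined by the AAPCS64 procedure
// call standard.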
166
168
170
171static inline EVT getPackedSVEVectorVT(EVT VT) {
172 switch (VT.getSimpleVT().SimpleTy) {
173 default:
174 llvm_unreachable("unexpected element type for vector");
175 case MVT::i8:
176 return MVT::nxv16i8;
177 case MVT::i16:
178 return MVT::nxv8i16;
179 case MVT::i32:
180 return MVT::nxv4i32;
181 case MVT::i64:
182 return MVT::nxv2i64;
183 case MVT::f16:
184 return MVT::nxv8f16;
185 case MVT::f32:
186 return MVT::nxv4f32;
187 case MVT::f64:
188 return MVT::nxv2f64;
189 case MVT::bf16:
190 return MVT::nxv8bf16;
191 }
192}
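// Illustrative example: getPackedSVEVectorVT(MVT::f16) returns MVT::nxv8f16,
// since eight 16-bit lanes fill one 128-bit SVE vector granule.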
193
194// NOTE: Currently there's only a need to return integer vector types. If this
195// changes then just add an extra "type" parameter.
197 switch (EC.getKnownMinValue()) {
198 default:
199 llvm_unreachable("unexpected element count for vector");
200 case 16:
201 return MVT::nxv16i8;
202 case 8:
203 return MVT::nxv8i16;
204 case 4:
205 return MVT::nxv4i32;
206 case 2:
207 return MVT::nxv2i64;
208 }
209}
210
212 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
213 "Expected scalable predicate vector type!");
214 switch (VT.getVectorMinNumElements()) {
215 default:
216 llvm_unreachable("unexpected element count for vector");
217 case 2:
218 return MVT::nxv2i64;
219 case 4:
220 return MVT::nxv4i32;
221 case 8:
222 return MVT::nxv8i16;
223 case 16:
224 return MVT::nxv16i8;
225 }
226}
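// Illustrative example: an nxv4i1 predicate is promoted to nxv4i32, i.e. one
// integer lane per predicate element.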
227
228/// Returns true if VT's elements occupy the lowest bit positions of its
229/// associated register class without any intervening space.
230///
231/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
232/// same register class, but only nxv8f16 can be treated as a packed vector.
233static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
235 "Expected legal vector type!");
236 return VT.isFixedLengthVector() ||
238}
239
240// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
241// predicate and end with a passthru value matching the result type.
242static bool isMergePassthruOpcode(unsigned Opc) {
243 switch (Opc) {
244 default:
245 return false;
276 return true;
277 }
278}
279
280// Returns true if inactive lanes are known to be zeroed by construction.
282 switch (Op.getOpcode()) {
283 default:
284 return false;
285 // We guarantee i1 splat_vectors to zero the other lanes
289 return true;
291 switch (Op.getConstantOperandVal(0)) {
292 default:
293 return false;
294 case Intrinsic::aarch64_sve_ptrue:
295 case Intrinsic::aarch64_sve_pnext:
296 case Intrinsic::aarch64_sve_cmpeq:
297 case Intrinsic::aarch64_sve_cmpne:
298 case Intrinsic::aarch64_sve_cmpge:
299 case Intrinsic::aarch64_sve_cmpgt:
300 case Intrinsic::aarch64_sve_cmphs:
301 case Intrinsic::aarch64_sve_cmphi:
302 case Intrinsic::aarch64_sve_cmpeq_wide:
303 case Intrinsic::aarch64_sve_cmpne_wide:
304 case Intrinsic::aarch64_sve_cmpge_wide:
305 case Intrinsic::aarch64_sve_cmpgt_wide:
306 case Intrinsic::aarch64_sve_cmplt_wide:
307 case Intrinsic::aarch64_sve_cmple_wide:
308 case Intrinsic::aarch64_sve_cmphs_wide:
309 case Intrinsic::aarch64_sve_cmphi_wide:
310 case Intrinsic::aarch64_sve_cmplo_wide:
311 case Intrinsic::aarch64_sve_cmpls_wide:
312 case Intrinsic::aarch64_sve_fcmpeq:
313 case Intrinsic::aarch64_sve_fcmpne:
314 case Intrinsic::aarch64_sve_fcmpge:
315 case Intrinsic::aarch64_sve_fcmpgt:
316 case Intrinsic::aarch64_sve_fcmpuo:
317 case Intrinsic::aarch64_sve_facgt:
318 case Intrinsic::aarch64_sve_facge:
319 case Intrinsic::aarch64_sve_whilege:
320 case Intrinsic::aarch64_sve_whilegt:
321 case Intrinsic::aarch64_sve_whilehi:
322 case Intrinsic::aarch64_sve_whilehs:
323 case Intrinsic::aarch64_sve_whilele:
324 case Intrinsic::aarch64_sve_whilelo:
325 case Intrinsic::aarch64_sve_whilels:
326 case Intrinsic::aarch64_sve_whilelt:
327 case Intrinsic::aarch64_sve_match:
328 case Intrinsic::aarch64_sve_nmatch:
329 case Intrinsic::aarch64_sve_whilege_x2:
330 case Intrinsic::aarch64_sve_whilegt_x2:
331 case Intrinsic::aarch64_sve_whilehi_x2:
332 case Intrinsic::aarch64_sve_whilehs_x2:
333 case Intrinsic::aarch64_sve_whilele_x2:
334 case Intrinsic::aarch64_sve_whilelo_x2:
335 case Intrinsic::aarch64_sve_whilels_x2:
336 case Intrinsic::aarch64_sve_whilelt_x2:
337 return true;
338 }
339 }
340}
341
342static std::tuple<SDValue, SDValue>
344 SDLoc DL(Disc);
345 SDValue AddrDisc;
346 SDValue ConstDisc;
347
348 // If this is a blend, remember the constant and address discriminators.
349 // Otherwise, it's either a constant discriminator, or a non-blended
350 // address discriminator.
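 // Illustrative example: a discriminator built as
 // @llvm.ptrauth.blend(%addr, i64 1234) splits into the pair (1234, %addr),
 // whereas a bare constant discriminator 1234 yields (1234, <no address disc>).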
351 if (Disc->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
352 Disc->getConstantOperandVal(0) == Intrinsic::ptrauth_blend) {
353 AddrDisc = Disc->getOperand(1);
354 ConstDisc = Disc->getOperand(2);
355 } else {
356 ConstDisc = Disc;
357 }
358
359 // If the constant discriminator (either the blend RHS, or the entire
360 // discriminator value) isn't a 16-bit constant, bail out, and let the
361 // discriminator be computed separately.
362 const auto *ConstDiscN = dyn_cast<ConstantSDNode>(ConstDisc);
363 if (!ConstDiscN || !isUInt<16>(ConstDiscN->getZExtValue()))
364 return std::make_tuple(DAG->getTargetConstant(0, DL, MVT::i64), Disc);
365
366 // If there's no address discriminator, use NoRegister, which we'll later
367 // replace with XZR, or directly use a Z variant of the inst. when available.
368 if (!AddrDisc)
369 AddrDisc = DAG->getRegister(AArch64::NoRegister, MVT::i64);
370
371 return std::make_tuple(
372 DAG->getTargetConstant(ConstDiscN->getZExtValue(), DL, MVT::i64),
373 AddrDisc);
374}
375
377 const AArch64Subtarget &STI)
378 : TargetLowering(TM), Subtarget(&STI) {
379 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
380 // we have to make something up. Arbitrarily, choose ZeroOrOne.
382 // When comparing vectors the result sets the different elements in the
383 // vector to all-one or all-zero.
385
386 // Set up the register classes.
387 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
388 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
389
390 if (Subtarget->hasLS64()) {
391 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
392 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
394 }
395
396 if (Subtarget->hasFPARMv8()) {
397 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
398 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
399 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
400 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
401 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
402 }
403
404 if (Subtarget->hasNEON()) {
405 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
406 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
407
408 addDRType(MVT::v2f32);
409 addDRType(MVT::v8i8);
410 addDRType(MVT::v4i16);
411 addDRType(MVT::v2i32);
412 addDRType(MVT::v1i64);
413 addDRType(MVT::v1f64);
414 addDRType(MVT::v4f16);
415 addDRType(MVT::v4bf16);
416
417 addQRType(MVT::v4f32);
418 addQRType(MVT::v2f64);
419 addQRType(MVT::v16i8);
420 addQRType(MVT::v8i16);
421 addQRType(MVT::v4i32);
422 addQRType(MVT::v2i64);
423 addQRType(MVT::v8f16);
424 addQRType(MVT::v8bf16);
425 }
426
427 if (Subtarget->isSVEorStreamingSVEAvailable()) {
428 // Add legal sve predicate types
429 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
430 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
431 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
432 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
433 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
434
435 // Add legal sve data types
436 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
437 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
438 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
439 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
440
441 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
442 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
443 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
444 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
445 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
446 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
447
448 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
449 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
450 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
451
452 if (Subtarget->useSVEForFixedLengthVectors()) {
455 addRegisterClass(VT, &AArch64::ZPRRegClass);
456
459 addRegisterClass(VT, &AArch64::ZPRRegClass);
460 }
461 }
462
463 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
464 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
465 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
466 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
467
468 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
469 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
470 }
471
472 // Compute derived properties from the register classes
474
475 // Provide all sorts of operation actions
515
517
521
525
527
528 // Custom lowering hooks are needed for XOR
529 // to fold it into CSINC/CSINV.
532
533 // Virtually no operation on f128 is legal, but LLVM can't expand them when
534 // there's a valid register class, so we need custom operations in most cases.
559 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
560 // aren't handled.
561
562 // Lowering for many of the conversions is actually specified by the non-f128
563 // type. The LowerXXX function will be trivial when f128 isn't involved.
588 if (Subtarget->hasFPARMv8()) {
591 }
594 if (Subtarget->hasFPARMv8()) {
597 }
600
605
606 // Variable arguments.
611
612 // Variable-sized objects.
615
616 // Lowering Funnel Shifts to EXTR
621
623
624 // Constant pool entries
626
627 // BlockAddress
629
630 // AArch64 lacks both left-rotate and popcount instructions.
636 }
637
638 // AArch64 doesn't have i32 MULH{S|U}.
641
642 // AArch64 doesn't have {U|S}MUL_LOHI.
647
648 if (Subtarget->hasCSSC()) {
652
654
658
661
666
671 } else {
675
678
681 }
682
688 }
695
696 // Custom lower Add/Sub/Mul with overflow.
709
718
727 if (Subtarget->hasFullFP16()) {
730 } else {
733 }
734
735 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
748 setOperationAction(Op, MVT::f16, Promote);
749 setOperationAction(Op, MVT::v4f16, Expand);
750 setOperationAction(Op, MVT::v8f16, Expand);
751 setOperationAction(Op, MVT::bf16, Promote);
752 setOperationAction(Op, MVT::v4bf16, Expand);
753 setOperationAction(Op, MVT::v8bf16, Expand);
754 }
755
756 // For bf16, fpextend is custom lowered to be optionally expanded into shifts.
763
764 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
765 for (auto Op : {
769 ISD::FADD,
770 ISD::FSUB,
771 ISD::FMUL,
772 ISD::FDIV,
773 ISD::FMA,
804 })
805 setOperationAction(Op, ScalarVT, Promote);
806
807 for (auto Op : {ISD::FNEG, ISD::FABS})
808 setOperationAction(Op, ScalarVT, Legal);
809
810 // Round-to-integer operations need custom lowering for fp16, as Promote
811 // doesn't work because the result type is integer.
815 setOperationAction(Op, ScalarVT, Custom);
816
817 // promote v4f16 to v4f32 when that is known to be safe.
818 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
819 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
820 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
821 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
822 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
823 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
824 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
825 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
826 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
827 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
828 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
829 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
830 setOperationPromotedToType(ISD::FCANONICALIZE, V4Narrow, MVT::v4f32);
831
841
842 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
864 setOperationPromotedToType(ISD::FCANONICALIZE, V8Narrow, MVT::v8f32);
865 };
866
867 if (!Subtarget->hasFullFP16()) {
868 LegalizeNarrowFP(MVT::f16);
869 }
870 LegalizeNarrowFP(MVT::bf16);
873
874 // AArch64 has implementations of a lot of rounding-like FP operations.
875 // clang-format off
876 for (auto Op :
888 for (MVT Ty : {MVT::f32, MVT::f64})
890 if (Subtarget->hasFullFP16())
891 setOperationAction(Op, MVT::f16, Legal);
892 }
893 // clang-format on
894
895 // Basic strict FP operations are legal
898 for (MVT Ty : {MVT::f32, MVT::f64})
900 if (Subtarget->hasFullFP16())
901 setOperationAction(Op, MVT::f16, Legal);
902 }
903
905
911
913 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
916 } else {
919 }
922
923 // Generate outline atomics library calls only if LSE was not specified for
924 // the subtarget.
925 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
951#define LCALLNAMES(A, B, N) \
952 setLibcallName(A##N##_RELAX, #B #N "_relax"); \
953 setLibcallName(A##N##_ACQ, #B #N "_acq"); \
954 setLibcallName(A##N##_REL, #B #N "_rel"); \
955 setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
956#define LCALLNAME4(A, B) \
957 LCALLNAMES(A, B, 1) \
958 LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
959#define LCALLNAME5(A, B) \
960 LCALLNAMES(A, B, 1) \
961 LCALLNAMES(A, B, 2) \
962 LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
963 LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
964 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
965 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
966 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
967 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
968 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
969#undef LCALLNAMES
970#undef LCALLNAME4
971#undef LCALLNAME5
972 }
973
974 if (Subtarget->hasLSE128()) {
975 // Custom lowering because i128 is not legal. Must be replaced by 2x64
976 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
980 }
981
982 // 128-bit loads and stores can be done without expanding
985
986 // Aligned 128-bit loads and stores are single-copy atomic according to the
987 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
988 if (Subtarget->hasLSE2()) {
991 }
992
993 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
994 // custom lowering, as there are no un-paired non-temporal stores and
995 // legalization will break up 256 bit inputs.
997 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
998 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
999 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
1000 setOperationAction(ISD::STORE, MVT::v8i32, Custom);
1001 setOperationAction(ISD::STORE, MVT::v8f32, Custom);
1002 setOperationAction(ISD::STORE, MVT::v4f64, Custom);
1003 setOperationAction(ISD::STORE, MVT::v4i64, Custom);
1004
1005 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
1006 // custom lowering, as there are no un-paired non-temporal loads and
1007 // legalization will break up 256 bit inputs.
1008 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
1009 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
1010 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
1011 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
1012 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
1013 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
1014 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
1015 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
1016
1017 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
1019
1020 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1021 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1022 // Issue __sincos_stret if available.
1025 } else {
1028 }
1029
1030 // Make floating-point constants legal for the large code model, so they don't
1031 // become loads from the constant pool.
1032 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
1035 }
1036
1037 // AArch64 does not have floating-point extending loads, i1 sign-extending
1038 // loads, floating-point truncating stores, or v2i32->v2i16 truncating stores.
1039 for (MVT VT : MVT::fp_valuetypes()) {
1040 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
1041 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
1042 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
1043 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
1044 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
1045 }
1046 for (MVT VT : MVT::integer_valuetypes())
1047 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
1048
1049 for (MVT WideVT : MVT::fp_valuetypes()) {
1050 for (MVT NarrowVT : MVT::fp_valuetypes()) {
1051 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
1052 setTruncStoreAction(WideVT, NarrowVT, Expand);
1053 }
1054 }
1055 }
1056
1057 if (Subtarget->hasFPARMv8()) {
1061 }
1062
1063 // Indexed loads and stores are supported.
1064 for (unsigned im = (unsigned)ISD::PRE_INC;
1066 setIndexedLoadAction(im, MVT::i8, Legal);
1067 setIndexedLoadAction(im, MVT::i16, Legal);
1068 setIndexedLoadAction(im, MVT::i32, Legal);
1069 setIndexedLoadAction(im, MVT::i64, Legal);
1070 setIndexedLoadAction(im, MVT::f64, Legal);
1071 setIndexedLoadAction(im, MVT::f32, Legal);
1072 setIndexedLoadAction(im, MVT::f16, Legal);
1073 setIndexedLoadAction(im, MVT::bf16, Legal);
1074 setIndexedStoreAction(im, MVT::i8, Legal);
1075 setIndexedStoreAction(im, MVT::i16, Legal);
1076 setIndexedStoreAction(im, MVT::i32, Legal);
1077 setIndexedStoreAction(im, MVT::i64, Legal);
1078 setIndexedStoreAction(im, MVT::f64, Legal);
1079 setIndexedStoreAction(im, MVT::f32, Legal);
1080 setIndexedStoreAction(im, MVT::f16, Legal);
1081 setIndexedStoreAction(im, MVT::bf16, Legal);
1082 }
1083
1084 // Trap.
1085 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1088
1089 // We combine OR nodes for bitfield operations.
1091 // Try to create BICs for vector ANDs.
1093
1094 // llvm.init.trampoline and llvm.adjust.trampoline
1097
1098 // Vector add and sub nodes may conceal a high-half opportunity.
1099 // Also, try to fold ADD into CSINC/CSINV.
1102
1105
1106 // Try and combine setcc with csel
1108
1110
1117
1119
1121
1123
1127
1130
1132
1134
1136
1140
1142
1143 // In case of strict alignment, avoid an excessive number of byte wide stores.
1146 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1147
1151 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1152
1155 Subtarget->requiresStrictAlign() ? MaxStoresPerMemmoveOptSize : 16;
1156
1159 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1160
1162
1164
1165 EnableExtLdPromotion = true;
1166
1167 // Set required alignment.
1169 // Set preferred alignments.
1170
1171 // Don't align loops on Windows. The SEH unwind info generation needs to
1172 // know the exact length of functions before the alignments have been
1173 // expanded.
1174 if (!Subtarget->isTargetWindows())
1178
1179 // Only change the limit for entries in a jump table if specified by
1180 // the subtarget, but not at the command line.
1181 unsigned MaxJT = STI.getMaximumJumpTableSize();
1182 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1184
1186
1188
1190 if (Subtarget->hasSME())
1192
1193 if (Subtarget->isNeonAvailable()) {
1194 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1195 // silliness like this:
1196 // clang-format off
1197 for (auto Op :
1218 setOperationAction(Op, MVT::v1f64, Expand);
1219 // clang-format on
1220
1221 for (auto Op :
1226 setOperationAction(Op, MVT::v1i64, Expand);
1227
1228 // AArch64 doesn't have direct vector->f32 conversion instructions for
1229 // elements smaller than i32, so promote the input to i32 first.
1230 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1231 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1232
1233 // Similarly, there is no direct i32 -> f64 vector conversion instruction.
1234 // Or, direct i32 -> f16 vector conversion. Set it to Custom, so the
1235 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1238 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1240
1241 if (Subtarget->hasFullFP16()) {
1244
1253 } else {
1254 // when AArch64 doesn't have fullfp16 support, promote the input
1255 // to i32 first.
1256 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1257 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1258 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1259 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1260 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1261 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1262 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1263 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1264 }
1265
1266 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1267 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1274 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1279 }
1280
1281 // Custom handling for some quad-vector types to detect MULL.
1282 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1283 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1284 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1285 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1286 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1287 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1288
1289 // Saturates
1290 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1291 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1296 }
1297
1298 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1299 MVT::v4i32}) {
1306 }
1307
1308 // Vector reductions
1309 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1310 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1311 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1316
1318 }
1319 }
1320 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1321 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1330 }
1335
1337 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1338 // Likewise, narrowing and extending vector loads/stores aren't handled
1339 // directly.
1342
1343 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1346 } else {
1349 }
1352
1355
1356 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1357 setTruncStoreAction(VT, InnerVT, Expand);
1358 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1359 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1360 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1361 }
1362 }
1363
1364 for (auto Op :
1370 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1372 if (Subtarget->hasFullFP16())
1373 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1375 }
1376
1377 // LRINT and LLRINT.
1378 for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
1379 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1381 if (Subtarget->hasFullFP16())
1382 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1384 }
1385
1386 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1387
1392
1396
1397 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1398 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1399 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1400 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1401 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1402 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1403
1404 // ADDP custom lowering
1405 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1407 // FADDP custom lowering
1408 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1410 } else /* !isNeonAvailable */ {
1412 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1414
1415 if (VT.is128BitVector() || VT.is64BitVector()) {
1419 Subtarget->isLittleEndian() ? Legal : Expand);
1420 }
1421 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1422 setTruncStoreAction(VT, InnerVT, Expand);
1423 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1424 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1425 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1426 }
1427 }
1428 }
1429
1430 for (MVT VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1434 }
1435
1436 if (Subtarget->hasSME()) {
1438 }
1439
1440 // FIXME: Move lowering for more nodes here if those are common between
1441 // SVE and SME.
1442 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1443 for (auto VT :
1444 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1449 }
1450 }
1451
1452 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1453 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1494
1500
1509
1514
1515 if (!Subtarget->isLittleEndian())
1517
1518 if (Subtarget->hasSVE2() ||
1519 (Subtarget->hasSME() && Subtarget->isStreaming()))
1520 // For SLI/SRI.
1522 }
1523
1524 // Illegal unpacked integer vector types.
1525 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1528 }
1529
1530 // Type legalize unpacked bitcasts.
1531 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32})
1533
1534 for (auto VT :
1535 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1536 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1538
1539 for (auto VT :
1540 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1548
1552
1553 // There are no legal MVT::nxv16f## based types.
1554 if (VT != MVT::nxv16i1) {
1557 }
1558 }
1559
1560 // NEON doesn't support masked loads/stores, but SME and SVE do.
1561 for (auto VT :
1562 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1563 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1564 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1567 }
1568
1569 // Firstly, exclude all scalable vector extending loads/truncating stores,
1570 // covering both integer and floating-point scalable vectors.
1572 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1573 setTruncStoreAction(VT, InnerVT, Expand);
1574 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1575 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1576 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1577 }
1578 }
1579
1580 // Then, selectively enable those which we directly support.
1581 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1582 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1583 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1584 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1585 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1586 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1587 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1588 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1589 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1590 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1591 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1592 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1593 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1594 }
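 // Illustrative note: each of these corresponds to a single SVE load, e.g. an
 // extending load from nxv2i8 to nxv2i64 is an LD1B (or LD1SB when
 // sign-extending) operating on 64-bit elements, hence Legal.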
1595
1596 // SVE supports truncating stores of 64 and 128-bit vectors
1597 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1598 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1599 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1600 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1601 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1602
1603 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1604 MVT::nxv4f32, MVT::nxv2f64}) {
1644
1666
1678 }
1679
1680 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1691
1692 if (Subtarget->hasSVEB16B16()) {
1701 }
1702 }
1703
1704 for (auto Opcode :
1707 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1708 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1709 setOperationAction(Opcode, MVT::nxv8bf16, Expand);
1710 }
1711
1712 if (!Subtarget->hasSVEB16B16()) {
1713 for (auto Opcode : {ISD::FADD, ISD::FMA, ISD::FMAXIMUM, ISD::FMAXNUM,
1715 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1716 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1717 setOperationAction(Opcode, MVT::nxv8bf16, Expand);
1718 }
1719 }
1720
1723
1724 // NEON doesn't support integer divides, but SVE does
1725 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1726 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1729 }
1730
1731 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1732 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1733 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1734
1735 // NOTE: Currently this has to happen after computeRegisterProperties rather
1736 // than the preferred option of combining it with the addRegisterClass call.
1737 if (Subtarget->useSVEForFixedLengthVectors()) {
1740 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1741 addTypeForFixedLengthSVE(VT);
1742 }
1745 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1746 addTypeForFixedLengthSVE(VT);
1747 }
1748
1749 // 64-bit results can mean an input wider than NEON supports.
1750 for (auto VT : {MVT::v8i8, MVT::v4i16})
1753
1754 // 128-bit results imply an input wider than NEON supports.
1755 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1757 for (auto VT : {MVT::v8f16, MVT::v4f32})
1759
1760 // These operations are not supported on NEON but SVE can do them.
1762 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1763 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1764 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1765 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1766 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1767 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1768 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1769 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1770 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1771 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1772 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1773 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1774 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1775 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1776 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1781
1782 // Int operations with no NEON support.
1783 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1784 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1792 }
1793
1794 // Use SVE for vectors with more than 2 elements.
1795 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1797 }
1798
1799 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1800 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1801 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1802 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1803
1805
1806 for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
1808 }
1809
1810 // Handle operations that are only available in non-streaming SVE mode.
1811 if (Subtarget->isSVEAvailable()) {
1812 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
1813 MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1814 MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16,
1815 MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32,
1816 MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8,
1817 MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1818 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1821 }
1822
1823 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1824 MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16,
1825 MVT::v2f32, MVT::v4f32, MVT::v2f64})
1827
1828 // We can lower types that have <vscale x {2|4}> elements to compact.
1829 for (auto VT :
1830 {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv2f32,
1831 MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv4f32})
1833
1834 // If we have SVE, we can use SVE logic for legal (or smaller than legal)
1835 // NEON vectors in the lowest bits of the SVE register.
1836 for (auto VT : {MVT::v2i8, MVT::v2i16, MVT::v2i32, MVT::v2i64, MVT::v2f32,
1837 MVT::v2f64, MVT::v4i8, MVT::v4i16, MVT::v4i32, MVT::v4f32})
1839
1840 // Histcnt is SVE2 only
1841 if (Subtarget->hasSVE2()) {
1843 Custom);
1845 Custom);
1846 }
1847 }
1848
1849
1850 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1851 // Only required for llvm.aarch64.mops.memset.tag
1853 }
1854
1856
1857 if (Subtarget->hasSVE()) {
1862 }
1863
1864 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1865
1866 IsStrictFPEnabled = true;
1868
1869 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
1870 // it, but it's just a wrapper around ldexp.
1871 if (Subtarget->isTargetWindows()) {
1873 if (isOperationExpand(Op, MVT::f32))
1874 setOperationAction(Op, MVT::f32, Promote);
1875 }
1876
1877 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
1878 // isn't legal.
1880 if (isOperationExpand(Op, MVT::f16))
1881 setOperationAction(Op, MVT::f16, Promote);
1882
1883 if (Subtarget->isWindowsArm64EC()) {
1884 // FIXME: are there intrinsics we need to exclude from this?
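 // Note (assumption for illustration): Arm64EC mangles native AArch64 symbols
 // with a leading '#', so e.g. the "memcpy" libcall is redirected to "#memcpy"
 // here to reach the native implementation directly.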
1885 for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
1886 auto code = static_cast<RTLIB::Libcall>(i);
1887 auto libcallName = getLibcallName(code);
1888 if ((libcallName != nullptr) && (libcallName[0] != '#')) {
1889 setLibcallName(code, Saver.save(Twine("#") + libcallName).data());
1890 }
1891 }
1892 }
1893}
1894
1895void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1896 assert(VT.isVector() && "VT should be a vector type");
1897
1898 if (VT.isFloatingPoint()) {
1900 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1901 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1902 }
1903
1904 // Mark vector float intrinsics as expand.
1905 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1923 }
1924
1925 // But we do support custom-lowering for FCOPYSIGN.
1926 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1927 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
1928 VT == MVT::v8f16) &&
1929 Subtarget->hasFullFP16()))
1931
1944
1948 for (MVT InnerVT : MVT::all_valuetypes())
1949 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1950
1951 // CNT supports only B element sizes; wider types then use UADDLP to widen.
1952 if (VT != MVT::v8i8 && VT != MVT::v16i8)
1954
1960
1961 for (unsigned Opcode :
1964 setOperationAction(Opcode, VT, Custom);
1965
1966 if (!VT.isFloatingPoint())
1968
1969 // [SU][MIN|MAX] are available for all NEON types apart from i64.
1970 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1971 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1972 setOperationAction(Opcode, VT, Legal);
1973
1974 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1975 // NEON types.
1976 if (VT.isFloatingPoint() &&
1977 VT.getVectorElementType() != MVT::bf16 &&
1978 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1979 for (unsigned Opcode :
1985 setOperationAction(Opcode, VT, Legal);
1986
1987 // Strict fp extend and trunc are legal
1988 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
1990 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
1992
1993 // FIXME: We could potentially make use of the vector comparison instructions
1994 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
1995 // complications:
1996 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1997 // so we would need to expand when the condition code doesn't match the
1998 // kind of comparison.
1999 // * Some kinds of comparison require more than one FCMXY instruction so
2000 // would need to be expanded instead.
2001 // * The lowering of the non-strict versions involves target-specific ISD
2002 // nodes so we would likely need to add strict versions of all of them and
2003 // handle them appropriately.
2006
2007 if (Subtarget->isLittleEndian()) {
2008 for (unsigned im = (unsigned)ISD::PRE_INC;
2012 }
2013 }
2014
2015 if (Subtarget->hasD128()) {
2018 }
2019}
2020
2022 EVT OpVT) const {
2023 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
2024 if (!Subtarget->hasSVE())
2025 return true;
2026
2027 // We can only support legal predicate result types. We can use the SVE
2028 // whilelo instruction for generating fixed-width predicates too.
2029 if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
2030 ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
2031 ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
2032 return true;
2033
2034 // The whilelo instruction only works with i32 or i64 scalar inputs.
2035 if (OpVT != MVT::i32 && OpVT != MVT::i64)
2036 return true;
2037
2038 return false;
2039}
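// Illustrative example: a lane mask producing nxv4i1 from i64 bounds maps 1:1
// onto WHILELO, so we return false (no expansion); unsupported result or
// operand types return true and are expanded generically.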
2040
2042 const IntrinsicInst *I) const {
2043 if (I->getIntrinsicID() != Intrinsic::experimental_vector_partial_reduce_add)
2044 return true;
2045
2046 EVT VT = EVT::getEVT(I->getType());
2047 auto Op1 = I->getOperand(1);
2048 EVT Op1VT = EVT::getEVT(Op1->getType());
2049 if (Op1VT.getVectorElementType() == VT.getVectorElementType() &&
2050 (VT.getVectorElementCount() * 4 == Op1VT.getVectorElementCount() ||
2051 VT.getVectorElementCount() * 2 == Op1VT.getVectorElementCount()))
2052 return false;
2053 return true;
2054}
2055
2057 if (!Subtarget->isSVEorStreamingSVEAvailable())
2058 return true;
2059
2060 // We can only use the BRKB + CNTP sequence with legal predicate types. We can
2061 // also support fixed-width predicates.
2062 return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
2063 VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
2064 VT != MVT::v4i1 && VT != MVT::v2i1;
2065}
2066
2068 unsigned SearchSize) const {
2069 // MATCH is SVE2 and only available in non-streaming mode.
2070 if (!Subtarget->hasSVE2() || !Subtarget->isSVEAvailable())
2071 return true;
2072 // Furthermore, we can only use it for 8-bit or 16-bit elements.
2073 if (VT == MVT::nxv8i16 || VT == MVT::v8i16)
2074 return SearchSize != 8;
2075 if (VT == MVT::nxv16i8 || VT == MVT::v16i8 || VT == MVT::v8i8)
2076 return SearchSize != 8 && SearchSize != 16;
2077 return true;
2078}
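// Illustrative example: with SVE2 available outside streaming mode, a match
// over nxv16i8 against a 16-element search vector can use the MATCH
// instruction (return false); other element types or search sizes expand.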
2079
2080void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
2081 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
2082
2083 // By default everything must be expanded.
2084 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
2086
2087 if (VT.isFloatingPoint()) {
2097 }
2098
2100 VT == MVT::v1f64 ? Expand : Custom;
2101
2102 // Mark integer truncating stores/extending loads as having custom lowering
2103 if (VT.isInteger()) {
2104 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
2105 while (InnerVT != VT) {
2106 setTruncStoreAction(VT, InnerVT, Default);
2107 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
2108 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
2109 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2110 InnerVT = InnerVT.changeVectorElementType(
2111 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
2112 }
2113 }
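 // Illustrative example: for VT == MVT::v8i32 the loop above marks v8i8 and
 // v8i16 truncating stores (and the matching extending loads) with the Default
 // action so they can be lowered via their scalable SVE containers.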
2114
2115 // Mark floating-point truncating stores/extending loads as having custom
2116 // lowering
2117 if (VT.isFloatingPoint()) {
2118 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
2119 while (InnerVT != VT) {
2120 setTruncStoreAction(VT, InnerVT, Custom);
2121 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2122 InnerVT = InnerVT.changeVectorElementType(
2124 }
2125 }
2126
2127 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
2128 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
2129
2130 // Lower fixed length vector operations to scalable equivalents.
2137 setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
2174 setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
2175 setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
2177 setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
2196 setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
2222}
2223
2224void AArch64TargetLowering::addDRType(MVT VT) {
2225 addRegisterClass(VT, &AArch64::FPR64RegClass);
2226 if (Subtarget->isNeonAvailable())
2227 addTypeForNEON(VT);
2228}
2229
2230void AArch64TargetLowering::addQRType(MVT VT) {
2231 addRegisterClass(VT, &AArch64::FPR128RegClass);
2232 if (Subtarget->isNeonAvailable())
2233 addTypeForNEON(VT);
2234}
2235
2237 LLVMContext &C, EVT VT) const {
2238 if (!VT.isVector())
2239 return MVT::i32;
2240 if (VT.isScalableVector())
2241 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2243}
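// Illustrative example: a compare of two nxv4i32 vectors gets an nxv4i1 result
// type, while a scalar compare gets i32.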
2244
2245// isIntImmediate - This method tests to see if the node is a constant
2246// operand. If so, Imm will receive the value.
2247static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2248 if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
2249 Imm = C->getZExtValue();
2250 return true;
2251 }
2252 return false;
2253}
2254
2255// isOpcWithIntImmediate - This method tests to see if the node is a specific
2256// opcode and that it has an immediate integer right operand.
2257// If so, Imm will receive the value.
2258static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2259 uint64_t &Imm) {
2260 return N->getOpcode() == Opc &&
2261 isIntImmediate(N->getOperand(1).getNode(), Imm);
2262}
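// Illustrative example: isOpcWithIntImmediate(N, ISD::AND, Imm) matches
// (and x, C) for a constant C and stores C's value in Imm.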
2263
2264static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2265 const APInt &Demanded,
2267 unsigned NewOpc) {
2268 uint64_t OldImm = Imm, NewImm, Enc;
2269 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2270
2271 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2272 // bimm64.
2273 if (Imm == 0 || Imm == Mask ||
2275 return false;
2276
2277 unsigned EltSize = Size;
2278 uint64_t DemandedBits = Demanded.getZExtValue();
2279
2280 // Clear bits that are not demanded.
2281 Imm &= DemandedBits;
2282
2283 while (true) {
2284 // The goal here is to set the non-demanded bits in a way that minimizes
2285 // the number of transitions between 0 and 1. In order to achieve this goal,
2286 // we set the non-demanded bits to the value of the preceding demanded bits.
2287 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2288 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2289 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2290 // The final result is 0b11000011.
2291 uint64_t NonDemandedBits = ~DemandedBits;
2292 uint64_t InvertedImm = ~Imm & DemandedBits;
2293 uint64_t RotatedImm =
2294 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2295 NonDemandedBits;
2296 uint64_t Sum = RotatedImm + NonDemandedBits;
2297 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2298 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2299 NewImm = (Imm | Ones) & Mask;
2300
2301 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2302 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2303 // we halve the element size and continue the search.
2304 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2305 break;
2306
2307 // We cannot shrink the element size any further if it is 2-bits.
2308 if (EltSize == 2)
2309 return false;
2310
2311 EltSize /= 2;
2312 Mask >>= EltSize;
2313 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2314
2315 // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
2316 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2317 return false;
2318
2319 // Merge the upper and lower halves of Imm and DemandedBits.
2320 Imm |= Hi;
2321 DemandedBits |= DemandedBitsHi;
2322 }
2323
2324 ++NumOptimizedImms;
2325
2326 // Replicate the element across the register width.
2327 while (EltSize < Size) {
2328 NewImm |= NewImm << EltSize;
2329 EltSize *= 2;
2330 }
2331
2332 (void)OldImm;
2333 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2334 "demanded bits should never be altered");
2335 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2336
2337 // Create the new constant immediate node.
2338 EVT VT = Op.getValueType();
2339 SDLoc DL(Op);
2340 SDValue New;
2341
2342 // If the new constant immediate is all-zeros or all-ones, let the target
2343 // independent DAG combine optimize this node.
2344 if (NewImm == 0 || NewImm == OrigMask) {
2345 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2346 TLO.DAG.getConstant(NewImm, DL, VT));
2347 // Otherwise, create a machine node so that target independent DAG combine
2348 // doesn't undo this optimization.
2349 } else {
2351 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2352 New = SDValue(
2353 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2354 }
2355
2356 return TLO.CombineTo(Op, New);
2357}
2358
2360 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2361 TargetLoweringOpt &TLO) const {
2362 // Delay this optimization to as late as possible.
2363 if (!TLO.LegalOps)
2364 return false;
2365
2367 return false;
2368
2369 EVT VT = Op.getValueType();
2370 if (VT.isVector())
2371 return false;
2372
2373 unsigned Size = VT.getSizeInBits();
2374
2375 if (Size != 32 && Size != 64)
2376 return false;
2377
2378 // Exit early if we demand all bits.
2379 if (DemandedBits.popcount() == Size)
2380 return false;
2381
2382 unsigned NewOpc;
2383 switch (Op.getOpcode()) {
2384 default:
2385 return false;
2386 case ISD::AND:
2387 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2388 break;
2389 case ISD::OR:
2390 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2391 break;
2392 case ISD::XOR:
2393 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2394 break;
2395 }
2396 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2397 if (!C)
2398 return false;
2399 uint64_t Imm = C->getZExtValue();
2400 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2401}
2402
2403/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2404/// Mask are known to be either zero or one and return them in Known.
2406 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2407 const SelectionDAG &DAG, unsigned Depth) const {
2408 switch (Op.getOpcode()) {
2409 default:
2410 break;
2411 case AArch64ISD::DUP: {
2412 SDValue SrcOp = Op.getOperand(0);
2413 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2414 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2415 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2416 "Expected DUP implicit truncation");
2417 Known = Known.trunc(Op.getScalarValueSizeInBits());
2418 }
2419 break;
2420 }
2421 case AArch64ISD::CSEL: {
2422 KnownBits Known2;
2423 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2424 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2425 Known = Known.intersectWith(Known2);
2426 break;
2427 }
2428 case AArch64ISD::BICi: {
2429 // Compute the bit cleared value.
2430 APInt Mask =
2431 ~(Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
2432 .trunc(Known.getBitWidth());
2433 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2434 Known &= KnownBits::makeConstant(Mask);
2435 break;
2436 }
2437 case AArch64ISD::VLSHR: {
2438 KnownBits Known2;
2439 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2440 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2441 Known = KnownBits::lshr(Known, Known2);
2442 break;
2443 }
2444 case AArch64ISD::VASHR: {
2445 KnownBits Known2;
2446 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2447 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2448 Known = KnownBits::ashr(Known, Known2);
2449 break;
2450 }
2451 case AArch64ISD::VSHL: {
2452 KnownBits Known2;
2453 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2454 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2455 Known = KnownBits::shl(Known, Known2);
2456 break;
2457 }
2458 case AArch64ISD::MOVI: {
2460 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2461 break;
2462 }
2464 case AArch64ISD::ADDlow: {
2465 if (!Subtarget->isTargetILP32())
2466 break;
2467 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2468 Known.Zero = APInt::getHighBitsSet(64, 32);
2469 break;
2470 }
2472 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2473 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2474 break;
2475 }
2477 Intrinsic::ID IntID =
2478 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2479 switch (IntID) {
2480 default: return;
2481 case Intrinsic::aarch64_ldaxr:
2482 case Intrinsic::aarch64_ldxr: {
2483 unsigned BitWidth = Known.getBitWidth();
2484 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2485 unsigned MemBits = VT.getScalarSizeInBits();
2486 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2487 return;
2488 }
2489 }
2490 break;
2491 }
2493 case ISD::INTRINSIC_VOID: {
2494 unsigned IntNo = Op.getConstantOperandVal(0);
2495 switch (IntNo) {
2496 default:
2497 break;
2498 case Intrinsic::aarch64_neon_uaddlv: {
2499 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2500 unsigned BitWidth = Known.getBitWidth();
2501 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2502 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2503 assert(BitWidth >= Bound && "Unexpected width!");
2505 Known.Zero |= Mask;
2506 }
2507 break;
2508 }
2509 case Intrinsic::aarch64_neon_umaxv:
2510 case Intrinsic::aarch64_neon_uminv: {
2511 // Figure out the datatype of the vector operand. The UMINV instruction
2512 // will zero extend the result, so we can mark as known zero all the
2513 // bits larger than the element datatype. 32-bit or larger doesn't need
2514 // this as those are legal types and will be handled by isel directly.
2515 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2516 unsigned BitWidth = Known.getBitWidth();
2517 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2518 assert(BitWidth >= 8 && "Unexpected width!");
2520 Known.Zero |= Mask;
2521 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2522 assert(BitWidth >= 16 && "Unexpected width!");
2524 Known.Zero |= Mask;
2525 }
2526 break;
2527 } break;
2528 }
2529 }
2530 }
2531}
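// Illustrative summary: e.g. for AArch64ISD::MOVI every bit of the result is
// known (it equals the immediate operand), and for CSEL the known bits are the
// intersection of what is known about both inputs.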
2532
2534 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2535 unsigned Depth) const {
2536 EVT VT = Op.getValueType();
2537 unsigned VTBits = VT.getScalarSizeInBits();
2538 unsigned Opcode = Op.getOpcode();
2539 switch (Opcode) {
2540 case AArch64ISD::CMEQ:
2541 case AArch64ISD::CMGE:
2542 case AArch64ISD::CMGT:
2543 case AArch64ISD::CMHI:
2544 case AArch64ISD::CMHS:
2545 case AArch64ISD::FCMEQ:
2546 case AArch64ISD::FCMGE:
2547 case AArch64ISD::FCMGT:
2548 case AArch64ISD::CMEQz:
2549 case AArch64ISD::CMGEz:
2550 case AArch64ISD::CMGTz:
2551 case AArch64ISD::CMLEz:
2552 case AArch64ISD::CMLTz:
2553 case AArch64ISD::FCMEQz:
2554 case AArch64ISD::FCMGEz:
2555 case AArch64ISD::FCMGTz:
2556 case AArch64ISD::FCMLEz:
2557 case AArch64ISD::FCMLTz:
2558 // Compares return either 0 or all-ones
2559 return VTBits;
2560 case AArch64ISD::VASHR: {
2561 unsigned Tmp =
2562 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
2563 return std::min<uint64_t>(Tmp + Op.getConstantOperandVal(1), VTBits);
2564 }
2565 }
2566
2567 return 1;
2568}
2569
2571 EVT) const {
2572 return MVT::i64;
2573}
2574
2576 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2577 unsigned *Fast) const {
2578
2579 // Allow SVE loads/stores where the alignment >= the size of the element type,
2580 // even with +strict-align. Predicated SVE loads/stores (e.g. ld1/st1), used
2581 // for stores that come from IR, only require element-size alignment (even if
2582 // unaligned accesses are disabled). Without this, these will be forced to
2583 // have 16-byte alignment with +strict-align (and fail to lower as we don't
2584 // yet support TLI.expandUnalignedLoad() and TLI.expandUnalignedStore()).
2585 if (VT.isScalableVector()) {
2586 unsigned ElementSizeBits = VT.getScalarSizeInBits();
2587 if (ElementSizeBits % 8 == 0 && Alignment >= Align(ElementSizeBits / 8))
2588 return true;
2589 }
2590
2591 if (Subtarget->requiresStrictAlign())
2592 return false;
2593
2594 if (Fast) {
2595 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2596 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2597 // See comments in performSTORECombine() for more details about
2598 // these conditions.
2599
2600 // Code that uses clang vector extensions can mark that it
2601 // wants unaligned accesses to be treated as fast by
2602 // underspecifying alignment to be 1 or 2.
2603 Alignment <= 2 ||
2604
2605 // Disregard v2i64. Memcpy lowering produces those and splitting
2606 // them regresses performance on micro-benchmarks and olden/bh.
2607 VT == MVT::v2i64;
2608 }
2609 return true;
2610}
2611
2612// Same as above but handling LLTs instead.
2613bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
2614 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2615 unsigned *Fast) const {
2616 if (Subtarget->requiresStrictAlign())
2617 return false;
2618
2619 if (Fast) {
2620 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2621 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2622 Ty.getSizeInBytes() != 16 ||
2623 // See comments in performSTORECombine() for more details about
2624 // these conditions.
2625
2626 // Code that uses clang vector extensions can mark that it
2627 // wants unaligned accesses to be treated as fast by
2628 // underspecifying alignment to be 1 or 2.
2629 Alignment <= 2 ||
2630
2631 // Disregard v2i64. Memcpy lowering produces those and splitting
2632 // them regresses performance on micro-benchmarks and olden/bh.
2633 Ty == LLT::fixed_vector(2, 64);
2634 }
2635 return true;
2636}
2637
2638FastISel *
2639AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
2640 const TargetLibraryInfo *libInfo) const {
2641 return AArch64::createFastISel(funcInfo, libInfo);
2642}
2643
2644const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2645#define MAKE_CASE(V) \
2646 case V: \
2647 return #V;
2648 switch ((AArch64ISD::NodeType)Opcode) {
2649 case AArch64ISD::FIRST_NUMBER:
2650 break;
2979 }
2980#undef MAKE_CASE
2981 return nullptr;
2982}
2983
2984MachineBasicBlock *
2985AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
2986 MachineBasicBlock *MBB) const {
2987 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2988 // phi node:
2989
2990 // OrigBB:
2991 // [... previous instrs leading to comparison ...]
2992 // b.ne TrueBB
2993 // b EndBB
2994 // TrueBB:
2995 // ; Fallthrough
2996 // EndBB:
2997 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2998
2999 MachineFunction *MF = MBB->getParent();
3000 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3001 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
3002 DebugLoc DL = MI.getDebugLoc();
3003 MachineFunction::iterator It = ++MBB->getIterator();
3004
3005 Register DestReg = MI.getOperand(0).getReg();
3006 Register IfTrueReg = MI.getOperand(1).getReg();
3007 Register IfFalseReg = MI.getOperand(2).getReg();
3008 unsigned CondCode = MI.getOperand(3).getImm();
3009 bool NZCVKilled = MI.getOperand(4).isKill();
3010
3011 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
3012 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
3013 MF->insert(It, TrueBB);
3014 MF->insert(It, EndBB);
3015
3016 // Transfer rest of current basic-block to EndBB
3017 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
3018 MBB->end());
3019 EndBB->transferSuccessorsAndUpdatePHIs(MBB);
3020
3021 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
3022 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
3023 MBB->addSuccessor(TrueBB);
3024 MBB->addSuccessor(EndBB);
3025
3026 // TrueBB falls through to the end.
3027 TrueBB->addSuccessor(EndBB);
3028
3029 if (!NZCVKilled) {
3030 TrueBB->addLiveIn(AArch64::NZCV);
3031 EndBB->addLiveIn(AArch64::NZCV);
3032 }
3033
3034 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
3035 .addReg(IfTrueReg)
3036 .addMBB(TrueBB)
3037 .addReg(IfFalseReg)
3038 .addMBB(MBB);
3039
3040 MI.eraseFromParent();
3041 return EndBB;
3042}
3043
3044MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
3045 MachineInstr &MI, MachineBasicBlock *BB) const {
3046 assert(!isAsynchronousEHPersonality(classifyEHPersonality(
3047 BB->getParent()->getFunction().getPersonalityFn())) &&
3048 "SEH does not use catchret!");
3049 return BB;
3050}
3051
3052MachineBasicBlock *
3053AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
3054 MachineBasicBlock *MBB) const {
3055 MachineFunction &MF = *MBB->getParent();
3056 MachineBasicBlock::iterator MBBI = MI.getIterator();
3058 const AArch64InstrInfo &TII =
3059 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
3060 Register TargetReg = MI.getOperand(0).getReg();
3061 MachineBasicBlock::iterator NextInst =
3062 TII.probedStackAlloc(MBBI, TargetReg, false);
3063
3064 MI.eraseFromParent();
3065 return NextInst->getParent();
3066}
3067
3068MachineBasicBlock *
3069AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
3070 MachineInstr &MI,
3071 MachineBasicBlock *BB) const {
3072 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3073 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3074
3075 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
3076 MIB.add(MI.getOperand(1)); // slice index register
3077 MIB.add(MI.getOperand(2)); // slice index offset
3078 MIB.add(MI.getOperand(3)); // pg
3079 MIB.add(MI.getOperand(4)); // base
3080 MIB.add(MI.getOperand(5)); // offset
3081
3082 MI.eraseFromParent(); // The pseudo is gone now.
3083 return BB;
3084}
3085
3086MachineBasicBlock *
3087AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const {
3088 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3089 MachineInstrBuilder MIB =
3090 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
3091
3092 MIB.addReg(AArch64::ZA, RegState::Define);
3093 MIB.add(MI.getOperand(0)); // Vector select register
3094 MIB.add(MI.getOperand(1)); // Vector select offset
3095 MIB.add(MI.getOperand(2)); // Base
3096 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
3097
3098 MI.eraseFromParent(); // The pseudo is gone now.
3099 return BB;
3100}
3101
3102MachineBasicBlock *AArch64TargetLowering::EmitZTInstr(MachineInstr &MI,
3103 MachineBasicBlock *BB,
3104 unsigned Opcode,
3105 bool Op0IsDef) const {
3106 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3107 MachineInstrBuilder MIB;
3108
3109 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
3110 .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
3111 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
3112 MIB.add(MI.getOperand(I));
3113
3114 MI.eraseFromParent(); // The pseudo is gone now.
3115 return BB;
3116}
3117
3118MachineBasicBlock *
3119AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
3120 MachineInstr &MI,
3121 MachineBasicBlock *BB) const {
3122 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3123 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3124 unsigned StartIdx = 0;
3125
3126 bool HasTile = BaseReg != AArch64::ZA;
3127 bool HasZPROut = HasTile && MI.getOperand(0).isReg();
3128 if (HasZPROut) {
3129 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3130 ++StartIdx;
3131 }
3132 if (HasTile) {
3133 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm(),
3134 RegState::Define); // Output ZA Tile
3135 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm()); // Input ZA Tile
3136 StartIdx++;
3137 } else {
3138 // Avoids all instructions with mnemonic za.<sz>[Reg, Imm].
3139 if (MI.getOperand(0).isReg() && !MI.getOperand(1).isImm()) {
3140 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3141 ++StartIdx;
3142 }
3143 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
3144 }
3145 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
3146 MIB.add(MI.getOperand(I));
3147
3148 MI.eraseFromParent(); // The pseudo is gone now.
3149 return BB;
3150}
3151
3152MachineBasicBlock *
3153AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const {
3154 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3155 MachineInstrBuilder MIB =
3156 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
3157 MIB.add(MI.getOperand(0)); // Mask
3158
3159 unsigned Mask = MI.getOperand(0).getImm();
3160 for (unsigned I = 0; I < 8; I++) {
3161 if (Mask & (1 << I))
3162 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
3163 }
3164
3165 MI.eraseFromParent(); // The pseudo is gone now.
3166 return BB;
3167}
3168
3169MachineBasicBlock *
3170AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI,
3171 MachineBasicBlock *BB) const {
3172 MachineFunction *MF = BB->getParent();
3173 MachineFrameInfo &MFI = MF->getFrameInfo();
3174 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3175 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3176 if (TPIDR2.Uses > 0) {
3177 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3178 // Store the buffer pointer to the TPIDR2 stack object.
3179 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
3180 .addReg(MI.getOperand(0).getReg())
3181 .addFrameIndex(TPIDR2.FrameIndex)
3182 .addImm(0);
3183 // Set the reserved bytes (10-15) to zero
3184 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
3185 .addReg(AArch64::WZR)
3186 .addFrameIndex(TPIDR2.FrameIndex)
3187 .addImm(5);
3188 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui))
3189 .addReg(AArch64::WZR)
3190 .addFrameIndex(TPIDR2.FrameIndex)
3191 .addImm(3);
3192 } else
3193 MFI.RemoveStackObject(TPIDR2.FrameIndex);
3194
3195 BB->remove_instr(&MI);
3196 return BB;
3197}
3198
3199MachineBasicBlock *
3200AArch64TargetLowering::EmitAllocateZABuffer(MachineInstr &MI,
3201 MachineBasicBlock *BB) const {
3202 MachineFunction *MF = BB->getParent();
3203 MachineFrameInfo &MFI = MF->getFrameInfo();
3204 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3205 // TODO: This function grows the stack with a subtraction, which doesn't work
3206 // on Windows. Some refactoring to share the functionality in
3207 // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
3208 // supports SME.
3209 assert(!MF->getSubtarget<AArch64Subtarget>().isTargetWindows() &&
3210 "Lazy ZA save is not yet supported on Windows");
3211
3212 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3213
3214 if (TPIDR2.Uses > 0) {
3215 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3216 MachineRegisterInfo &MRI = MF->getRegInfo();
3217
3218 // The MSUBXrrr below won't always be emitted in a form that accepts SP
3219 // directly.
3220 Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3221 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
3222 .addReg(AArch64::SP);
3223
3224 // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
3225 auto Size = MI.getOperand(1).getReg();
3226 auto Dest = MI.getOperand(0).getReg();
3227 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), Dest)
3228 .addReg(Size)
3229 .addReg(Size)
3230 .addReg(SP);
3231 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3232 AArch64::SP)
3233 .addReg(Dest);
3234
3235 // We have just allocated a variable sized object, tell this to PEI.
3236 MFI.CreateVariableSizedObject(Align(16), nullptr);
3237 }
3238
3239 BB->remove_instr(&MI);
3240 return BB;
3241}
3242
3243// TODO: Find a way to merge this with EmitAllocateZABuffer.
3244MachineBasicBlock *
3245AArch64TargetLowering::EmitAllocateSMESaveBuffer(MachineInstr &MI,
3246 MachineBasicBlock *BB) const {
3247 MachineFunction *MF = BB->getParent();
3248 MachineFrameInfo &MFI = MF->getFrameInfo();
3251 "Lazy ZA save is not yet supported on Windows");
3252
3253 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3254 if (FuncInfo->isSMESaveBufferUsed()) {
3255 // Allocate a buffer object of the size given by MI.getOperand(1).
3256 auto Size = MI.getOperand(1).getReg();
3257 auto Dest = MI.getOperand(0).getReg();
3258 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::SUBXrx64), AArch64::SP)
3259 .addReg(AArch64::SP)
3260 .addReg(Size)
3261 .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0));
3262 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Dest)
3263 .addReg(AArch64::SP);
3264
3265 // We have just allocated a variable sized object, tell this to PEI.
3266 MFI.CreateVariableSizedObject(Align(16), nullptr);
3267 } else
3268 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF),
3269 MI.getOperand(0).getReg());
3270
3271 BB->remove_instr(&MI);
3272 return BB;
3273}
3274
3275MachineBasicBlock *
3276AArch64TargetLowering::EmitGetSMESaveSize(MachineInstr &MI,
3277 MachineBasicBlock *BB) const {
3278 // If the buffer is used, emit a call to __arm_sme_state_size()
3279 MachineFunction *MF = BB->getParent();
3280 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3281 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3282 if (FuncInfo->isSMESaveBufferUsed()) {
3283 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3284 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL))
3285 .addExternalSymbol("__arm_sme_state_size")
3286 .addReg(AArch64::X0, RegState::ImplicitDefine)
3287 .addRegMask(TRI->getCallPreservedMask(
3288 *MF, CallingConv::
3290 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3291 MI.getOperand(0).getReg())
3292 .addReg(AArch64::X0);
3293 } else
3294 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3295 MI.getOperand(0).getReg())
3296 .addReg(AArch64::XZR);
3297 BB->remove_instr(&MI);
3298 return BB;
3299}
3300
3301MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
3302 MachineInstr &MI, MachineBasicBlock *BB) const {
3303
3304 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
3305 if (SMEOrigInstr != -1) {
3306 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3307 uint64_t SMEMatrixType =
3308 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
3309 switch (SMEMatrixType) {
3310 case AArch64::SMEMatrixArray:
3311 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB);
3312 case AArch64::SMEMatrixTileB:
3313 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB);
3314 case AArch64::SMEMatrixTileH:
3315 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB);
3316 case AArch64::SMEMatrixTileS:
3317 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB);
3318 case AArch64::SMEMatrixTileD:
3319 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB);
3320 case AArch64::SMEMatrixTileQ:
3321 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB);
3322 }
3323 }
3324
3325 switch (MI.getOpcode()) {
3326 default:
3327#ifndef NDEBUG
3328 MI.dump();
3329#endif
3330 llvm_unreachable("Unexpected instruction for custom inserter!");
3331 case AArch64::InitTPIDR2Obj:
3332 return EmitInitTPIDR2Object(MI, BB);
3333 case AArch64::AllocateZABuffer:
3334 return EmitAllocateZABuffer(MI, BB);
3335 case AArch64::AllocateSMESaveBuffer:
3336 return EmitAllocateSMESaveBuffer(MI, BB);
3337 case AArch64::GetSMESaveSize:
3338 return EmitGetSMESaveSize(MI, BB);
3339 case AArch64::F128CSEL:
3340 return EmitF128CSEL(MI, BB);
3341 case TargetOpcode::STATEPOINT:
3342 // STATEPOINT is a pseudo instruction which has no implicit defs/uses
3343 // while the bl call instruction (where the statepoint will be lowered at the
3344 // end) has an implicit def. This def is early-clobber as it will be set at
3345 // the moment of the call and earlier than any use is read.
3346 // Add this implicit dead def here as a workaround.
3347 MI.addOperand(*MI.getMF(),
3348 MachineOperand::CreateReg(
3349 AArch64::LR, /*isDef*/ true,
3350 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
3351 /*isUndef*/ false, /*isEarlyClobber*/ true));
3352 [[fallthrough]];
3353 case TargetOpcode::STACKMAP:
3354 case TargetOpcode::PATCHPOINT:
3355 return emitPatchPoint(MI, BB);
3356
3357 case TargetOpcode::PATCHABLE_EVENT_CALL:
3358 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
3359 return BB;
3360
3361 case AArch64::CATCHRET:
3362 return EmitLoweredCatchRet(MI, BB);
3363
3364 case AArch64::PROBED_STACKALLOC_DYN:
3365 return EmitDynamicProbedAlloc(MI, BB);
3366
3367 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
3368 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
3369 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
3370 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
3371 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
3372 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
3373 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
3374 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
3375 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
3376 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
3377 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
3378 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
3379 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
3380 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
3381 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
3382 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
3383 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
3384 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
3385 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
3386 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
3387 case AArch64::LDR_ZA_PSEUDO:
3388 return EmitFill(MI, BB);
3389 case AArch64::LDR_TX_PSEUDO:
3390 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
3391 case AArch64::STR_TX_PSEUDO:
3392 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
3393 case AArch64::ZERO_M_PSEUDO:
3394 return EmitZero(MI, BB);
3395 case AArch64::ZERO_T_PSEUDO:
3396 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
3397 case AArch64::MOVT_TIZ_PSEUDO:
3398 return EmitZTInstr(MI, BB, AArch64::MOVT_TIZ, /*Op0IsDef=*/true);
3399 }
3400}
3401
3402//===----------------------------------------------------------------------===//
3403// AArch64 Lowering private implementation.
3404//===----------------------------------------------------------------------===//
3405
3406//===----------------------------------------------------------------------===//
3407// Lowering Code
3408//===----------------------------------------------------------------------===//
3409
3410// Forward declarations of SVE fixed length lowering helpers
3415 SelectionDAG &DAG);
3418 EVT VT);
3419
3420/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3421static bool isZerosVector(const SDNode *N) {
3422 // Look through a bit convert.
3423 while (N->getOpcode() == ISD::BITCAST)
3424 N = N->getOperand(0).getNode();
3425
3426 if (ISD::isConstantSplatVectorAllZeros(N))
3427 return true;
3428
3429 if (N->getOpcode() != AArch64ISD::DUP)
3430 return false;
3431
3432 auto Opnd0 = N->getOperand(0);
3433 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3434}
3435
3436/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3437/// CC
3438static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
3439 switch (CC) {
3440 default:
3441 llvm_unreachable("Unknown condition code!");
3442 case ISD::SETNE:
3443 return AArch64CC::NE;
3444 case ISD::SETEQ:
3445 return AArch64CC::EQ;
3446 case ISD::SETGT:
3447 return AArch64CC::GT;
3448 case ISD::SETGE:
3449 return AArch64CC::GE;
3450 case ISD::SETLT:
3451 return AArch64CC::LT;
3452 case ISD::SETLE:
3453 return AArch64CC::LE;
3454 case ISD::SETUGT:
3455 return AArch64CC::HI;
3456 case ISD::SETUGE:
3457 return AArch64CC::HS;
3458 case ISD::SETULT:
3459 return AArch64CC::LO;
3460 case ISD::SETULE:
3461 return AArch64CC::LS;
3462 }
3463}
3464
3465/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3466static void changeFPCCToAArch64CC(ISD::CondCode CC,
3467 AArch64CC::CondCode &CondCode,
3468 AArch64CC::CondCode &CondCode2) {
3469 CondCode2 = AArch64CC::AL;
3470 switch (CC) {
3471 default:
3472 llvm_unreachable("Unknown FP condition!");
3473 case ISD::SETEQ:
3474 case ISD::SETOEQ:
3475 CondCode = AArch64CC::EQ;
3476 break;
3477 case ISD::SETGT:
3478 case ISD::SETOGT:
3479 CondCode = AArch64CC::GT;
3480 break;
3481 case ISD::SETGE:
3482 case ISD::SETOGE:
3483 CondCode = AArch64CC::GE;
3484 break;
3485 case ISD::SETOLT:
3486 CondCode = AArch64CC::MI;
3487 break;
3488 case ISD::SETOLE:
3489 CondCode = AArch64CC::LS;
3490 break;
3491 case ISD::SETONE:
3492 CondCode = AArch64CC::MI;
3493 CondCode2 = AArch64CC::GT;
3494 break;
3495 case ISD::SETO:
3496 CondCode = AArch64CC::VC;
3497 break;
3498 case ISD::SETUO:
3499 CondCode = AArch64CC::VS;
3500 break;
3501 case ISD::SETUEQ:
3502 CondCode = AArch64CC::EQ;
3503 CondCode2 = AArch64CC::VS;
3504 break;
3505 case ISD::SETUGT:
3506 CondCode = AArch64CC::HI;
3507 break;
3508 case ISD::SETUGE:
3509 CondCode = AArch64CC::PL;
3510 break;
3511 case ISD::SETLT:
3512 case ISD::SETULT:
3513 CondCode = AArch64CC::LT;
3514 break;
3515 case ISD::SETLE:
3516 case ISD::SETULE:
3517 CondCode = AArch64CC::LE;
3518 break;
3519 case ISD::SETNE:
3520 case ISD::SETUNE:
3521 CondCode = AArch64CC::NE;
3522 break;
3523 }
3524}
3525
3526/// Convert a DAG fp condition code to an AArch64 CC.
3527/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3528/// should be AND'ed instead of OR'ed.
3529static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
3530 AArch64CC::CondCode &CondCode,
3531 AArch64CC::CondCode &CondCode2) {
3532 CondCode2 = AArch64CC::AL;
3533 switch (CC) {
3534 default:
3535 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3536 assert(CondCode2 == AArch64CC::AL);
3537 break;
3538 case ISD::SETONE:
3539 // (a one b)
3540 // == ((a olt b) || (a ogt b))
3541 // == ((a ord b) && (a une b))
3542 CondCode = AArch64CC::VC;
3543 CondCode2 = AArch64CC::NE;
3544 break;
3545 case ISD::SETUEQ:
3546 // (a ueq b)
3547 // == ((a uno b) || (a oeq b))
3548 // == ((a ule b) && (a uge b))
3549 CondCode = AArch64CC::PL;
3550 CondCode2 = AArch64CC::LE;
3551 break;
3552 }
3553}
3554
3555/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3556/// CC usable with the vector instructions. Fewer operations are available
3557/// without a real NZCV register, so we have to use less efficient combinations
3558/// to get the same effect.
3559static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
3560 AArch64CC::CondCode &CondCode,
3561 AArch64CC::CondCode &CondCode2,
3562 bool &Invert) {
3563 Invert = false;
3564 switch (CC) {
3565 default:
3566 // Mostly the scalar mappings work fine.
3567 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3568 break;
3569 case ISD::SETUO:
3570 Invert = true;
3571 [[fallthrough]];
3572 case ISD::SETO:
3573 CondCode = AArch64CC::MI;
3574 CondCode2 = AArch64CC::GE;
3575 break;
3576 case ISD::SETUEQ:
3577 case ISD::SETULT:
3578 case ISD::SETULE:
3579 case ISD::SETUGT:
3580 case ISD::SETUGE:
3581 // All of the compare-mask comparisons are ordered, but we can switch
3582 // between the two by a double inversion. E.g. ULE == !OGT.
3583 Invert = true;
3584 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3585 CondCode, CondCode2);
3586 break;
3587 }
3588}
3589
3590static bool isLegalArithImmed(uint64_t C) {
3591 // Matches AArch64DAGToDAGISel::SelectArithImmed().
3592 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
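// For example, 0xabc (12 bits) and 0xabc000 (12 bits shifted left by 12) are
// legal, while 0xabc0 and 0x1fff both fail the check above.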
3593 LLVM_DEBUG(dbgs() << "Is imm " << C
3594 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3595 return IsLegal;
3596}
3597
3598static bool cannotBeIntMin(SDValue CheckedVal, SelectionDAG &DAG) {
3599 KnownBits KnownSrc = DAG.computeKnownBits(CheckedVal);
3600 return !KnownSrc.getSignedMinValue().isMinSignedValue();
3601}
3602
3603// Can a (CMP op1, (sub 0, op2) be turned into a CMN instruction on
3604// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
3605// can be set differently by this operation. It comes down to whether
3606// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3607// everything is fine. If not then the optimization is wrong. Thus general
3608// comparisons are only valid if op2 != 0.
3609//
3610// So, finally, the only LLVM-native comparisons that don't mention C or V
3611// are the equality comparisons (SETEQ/SETNE). They're the only ones we can
3612// safely use CMN for in the absence of information about op2.
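// For example, (seteq a, (sub 0, b)) can always become a CMN, whereas
// (setult a, (sub 0, b)) may only do so when b is known to be non-zero.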
3613static bool isCMN(SDValue Op, ISD::CondCode CC, SelectionDAG &DAG) {
3614 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3615 (isIntEqualitySetCC(CC) ||
3616 (isUnsignedIntSetCC(CC) && DAG.isKnownNeverZero(Op.getOperand(1))) ||
3617 (isSignedIntSetCC(CC) && cannotBeIntMin(Op.getOperand(1), DAG)));
3618}
3619
3620static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
3621 SelectionDAG &DAG, SDValue Chain,
3622 bool IsSignaling) {
3623 EVT VT = LHS.getValueType();
3624 assert(VT != MVT::f128);
3625
3626 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3627
3628 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3629 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3630 {Chain, LHS});
3631 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3632 {LHS.getValue(1), RHS});
3633 Chain = RHS.getValue(1);
3634 }
3635 unsigned Opcode =
3636 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3637 return DAG.getNode(Opcode, dl, {MVT::i32, MVT::Other}, {Chain, LHS, RHS});
3638}
3639
3640static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3641 const SDLoc &dl, SelectionDAG &DAG) {
3642 EVT VT = LHS.getValueType();
3643 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3644
3645 if (VT.isFloatingPoint()) {
3646 assert(VT != MVT::f128);
3647 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3648 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
3649 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
3650 }
3651 return DAG.getNode(AArch64ISD::FCMP, dl, MVT::i32, LHS, RHS);
3652 }
3653
3654 // The CMP instruction is just an alias for SUBS, and representing it as
3655 // SUBS means that it's possible to get CSE with subtract operations.
3656 // A later phase can perform the optimization of setting the destination
3657 // register to WZR/XZR if it ends up being unused.
3658 unsigned Opcode = AArch64ISD::SUBS;
3659
3660 if (isCMN(RHS, CC, DAG)) {
3661 // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ?
3662 Opcode = AArch64ISD::ADDS;
3663 RHS = RHS.getOperand(1);
3664 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3665 isIntEqualitySetCC(CC)) {
3666 // As we are looking for EQ/NE compares, the operands can be commuted; can
3667 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
3668 Opcode = AArch64ISD::ADDS;
3669 LHS = LHS.getOperand(1);
3670 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3671 if (LHS.getOpcode() == ISD::AND) {
3672 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3673 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3674 // of the signed comparisons.
3675 const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
3676 DAG.getVTList(VT, MVT_CC),
3677 LHS.getOperand(0),
3678 LHS.getOperand(1));
3679 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3680 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3681 return ANDSNode.getValue(1);
3682 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3683 // Use result of ANDS
3684 return LHS.getValue(1);
3685 }
3686 }
3687
3688 return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
3689 .getValue(1);
3690}
3691
3692/// \defgroup AArch64CCMP CMP;CCMP matching
3693///
3694/// These functions deal with the formation of CMP;CCMP;... sequences.
3695/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3696/// a comparison. They set the NZCV flags to a predefined value if their
3697/// predicate is false. This allows expressing arbitrary conjunctions, for
3698/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3699/// expressed as:
3700/// cmp A
3701/// ccmp B, inv(CB), CA
3702/// check for CB flags
3703///
3704/// This naturally lets us implement chains of AND operations with SETCC
3705/// operands. And we can even implement some other situations by transforming
3706/// them:
3707/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3708/// negating the flags used in a CCMP/FCCMP operations.
3709/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3710/// by negating the flags we test for afterwards. i.e.
3711/// NEG (CMP CCMP CCCMP ...) can be implemented.
3712/// - Note that we can only ever negate all previously processed results.
3713/// What we can not implement by flipping the flags to test is a negation
3714/// of two sub-trees (because the negation affects all sub-trees emitted so
3715/// far, so the 2nd sub-tree we emit would also affect the first).
3716/// With those tools we can implement some OR operations:
3717/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3718/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3719/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3720/// elimination rules from earlier to implement the whole thing as a
3721/// CCMP/FCCMP chain.
3722///
3723/// As complete example:
3724/// or (or (setCA (cmp A)) (setCB (cmp B)))
3725/// (and (setCC (cmp C)) (setCD (cmp D)))"
3726/// can be reassociated to:
3727/// or (and (setCC (cmp C)) setCD (cmp D))
3728/// (or (setCA (cmp A)) (setCB (cmp B)))
3729/// can be transformed to:
3730/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3731/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
3732/// which can be implemented as:
3733/// cmp C
3734/// ccmp D, inv(CD), CC
3735/// ccmp A, CA, inv(CD)
3736/// ccmp B, CB, inv(CA)
3737/// check for CB flags
3738///
3739/// A counterexample is "or (and A B) (and C D)" which translates to
3740/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))); we
3741/// can only implement one of the inner (not) operations, but not both!
3742/// @{
3743
3744/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3745static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
3746 ISD::CondCode CC, SDValue CCOp,
3747 AArch64CC::CondCode Predicate,
3748 AArch64CC::CondCode OutCC,
3749 const SDLoc &DL, SelectionDAG &DAG) {
3750 unsigned Opcode = 0;
3751 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3752
3753 if (LHS.getValueType().isFloatingPoint()) {
3754 assert(LHS.getValueType() != MVT::f128);
3755 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3756 LHS.getValueType() == MVT::bf16) {
3757 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3758 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3759 }
3760 Opcode = AArch64ISD::FCCMP;
3761 } else if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(RHS)) {
3762 APInt Imm = Const->getAPIntValue();
3763 if (Imm.isNegative() && Imm.sgt(-32)) {
3764 Opcode = AArch64ISD::CCMN;
3765 RHS = DAG.getConstant(Imm.abs(), DL, Const->getValueType(0));
3766 }
3767 } else if (isCMN(RHS, CC, DAG)) {
3768 Opcode = AArch64ISD::CCMN;
3769 RHS = RHS.getOperand(1);
3770 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3771 isIntEqualitySetCC(CC)) {
3772 // As we are looking for EQ/NE compares, the operands can be commuted; can
3773 // we combine a (CCMP (sub 0, op1), op2) into a CCMN instruction ?
3774 Opcode = AArch64ISD::CCMN;
3775 LHS = LHS.getOperand(1);
3776 }
3777 if (Opcode == 0)
3778 Opcode = AArch64ISD::CCMP;
3779
3780 SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
3781 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
3782 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3783 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3784 return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
3785}
3786
3787/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3788/// expressed as a conjunction. See \ref AArch64CCMP.
3789/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3790/// changing the conditions on the SETCC tests.
3791/// (this means we can call emitConjunctionRec() with
3792/// Negate==true on this sub-tree)
3793/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3794/// cannot do the negation naturally. We are required to
3795/// emit the subtree first in this case.
3796/// \param WillNegate Is true if we are called when the result of this
3797/// subexpression must be negated. This happens when the
3798/// outer expression is an OR. We can use this fact to know
3799/// that we have a double negation (or (or ...) ...) that
3800/// can be implemented for free.
3801static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3802 bool &MustBeFirst, bool WillNegate,
3803 unsigned Depth = 0) {
3804 if (!Val.hasOneUse())
3805 return false;
3806 unsigned Opcode = Val->getOpcode();
3807 if (Opcode == ISD::SETCC) {
3808 if (Val->getOperand(0).getValueType() == MVT::f128)
3809 return false;
3810 CanNegate = true;
3811 MustBeFirst = false;
3812 return true;
3813 }
3814 // Protect against exponential runtime and stack overflow.
3815 if (Depth > 6)
3816 return false;
3817 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3818 bool IsOR = Opcode == ISD::OR;
3819 SDValue O0 = Val->getOperand(0);
3820 SDValue O1 = Val->getOperand(1);
3821 bool CanNegateL;
3822 bool MustBeFirstL;
3823 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3824 return false;
3825 bool CanNegateR;
3826 bool MustBeFirstR;
3827 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3828 return false;
3829
3830 if (MustBeFirstL && MustBeFirstR)
3831 return false;
3832
3833 if (IsOR) {
3834 // For an OR expression we need to be able to naturally negate at least
3835 // one side or we cannot do the transformation at all.
3836 if (!CanNegateL && !CanNegateR)
3837 return false;
3838 // If the result of the OR will be negated and we can naturally negate
3839 // the leaves, then this sub-tree as a whole negates naturally.
3840 CanNegate = WillNegate && CanNegateL && CanNegateR;
3841 // If we cannot naturally negate the whole sub-tree, then this must be
3842 // emitted first.
3843 MustBeFirst = !CanNegate;
3844 } else {
3845 assert(Opcode == ISD::AND && "Must be OR or AND");
3846 // We cannot naturally negate an AND operation.
3847 CanNegate = false;
3848 MustBeFirst = MustBeFirstL || MustBeFirstR;
3849 }
3850 return true;
3851 }
3852 return false;
3853}
3854
3855/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
3856/// of CCMP/FCCMP ops. See @ref AArch64CCMP.
3857/// Tries to transform the given i1 producing node @p Val to a series of compare
3858/// and conditional compare operations. @returns an NZCV flags producing node
3859/// and sets @p OutCC to the flags that should be tested or returns SDValue() if
3860/// the transformation was not possible.
3861/// \p Negate is true if we want this sub-tree being negated just by changing
3862/// SETCC conditions.
3863static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
3864 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3865 AArch64CC::CondCode Predicate) {
3866 // We're at a tree leaf, produce a conditional comparison operation.
3867 unsigned Opcode = Val->getOpcode();
3868 if (Opcode == ISD::SETCC) {
3869 SDValue LHS = Val->getOperand(0);
3870 SDValue RHS = Val->getOperand(1);
3871 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3872 bool isInteger = LHS.getValueType().isInteger();
3873 if (Negate)
3874 CC = getSetCCInverse(CC, LHS.getValueType());
3875 SDLoc DL(Val);
3876 // Determine OutCC and handle FP special case.
3877 if (isInteger) {
3878 OutCC = changeIntCCToAArch64CC(CC);
3879 } else {
3880 assert(LHS.getValueType().isFloatingPoint());
3881 AArch64CC::CondCode ExtraCC;
3882 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3883 // Some floating point conditions can't be tested with a single condition
3884 // code. Construct an additional comparison in this case.
3885 if (ExtraCC != AArch64CC::AL) {
3886 SDValue ExtraCmp;
3887 if (!CCOp.getNode())
3888 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3889 else
3890 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3891 ExtraCC, DL, DAG);
3892 CCOp = ExtraCmp;
3893 Predicate = ExtraCC;
3894 }
3895 }
3896
3897 // Produce a normal comparison if we are first in the chain
3898 if (!CCOp)
3899 return emitComparison(LHS, RHS, CC, DL, DAG);
3900 // Otherwise produce a ccmp.
3901 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3902 DAG);
3903 }
3904 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3905
3906 bool IsOR = Opcode == ISD::OR;
3907
3908 SDValue LHS = Val->getOperand(0);
3909 bool CanNegateL;
3910 bool MustBeFirstL;
3911 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3912 assert(ValidL && "Valid conjunction/disjunction tree");
3913 (void)ValidL;
3914
3915 SDValue RHS = Val->getOperand(1);
3916 bool CanNegateR;
3917 bool MustBeFirstR;
3918 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
3919 assert(ValidR && "Valid conjunction/disjunction tree");
3920 (void)ValidR;
3921
3922 // Swap sub-tree that must come first to the right side.
3923 if (MustBeFirstL) {
3924 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3925 std::swap(LHS, RHS);
3926 std::swap(CanNegateL, CanNegateR);
3927 std::swap(MustBeFirstL, MustBeFirstR);
3928 }
3929
3930 bool NegateR;
3931 bool NegateAfterR;
3932 bool NegateL;
3933 bool NegateAfterAll;
3934 if (Opcode == ISD::OR) {
3935 // Swap the sub-tree that we can negate naturally to the left.
3936 if (!CanNegateL) {
3937 assert(CanNegateR && "at least one side must be negatable");
3938 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3939 assert(!Negate);
3940 std::swap(LHS, RHS);
3941 NegateR = false;
3942 NegateAfterR = true;
3943 } else {
3944 // Negate the left sub-tree if possible, otherwise negate the result.
3945 NegateR = CanNegateR;
3946 NegateAfterR = !CanNegateR;
3947 }
3948 NegateL = true;
3949 NegateAfterAll = !Negate;
3950 } else {
3951 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3952 assert(!Negate && "Valid conjunction/disjunction tree");
3953
3954 NegateL = false;
3955 NegateR = false;
3956 NegateAfterR = false;
3957 NegateAfterAll = false;
3958 }
3959
3960 // Emit sub-trees.
3961 AArch64CC::CondCode RHSCC;
3962 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
3963 if (NegateAfterR)
3964 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
3965 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
3966 if (NegateAfterAll)
3967 OutCC = AArch64CC::getInvertedCondCode(OutCC);
3968 return CmpL;
3969}
3970
3971/// Emit expression as a conjunction (a series of CCMP/FCCMP ops).
3972/// In some cases this is even possible with OR operations in the expression.
3973/// See \ref AArch64CCMP.
3974/// \see emitConjunctionRec().
3975static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
3976 AArch64CC::CondCode &OutCC) {
3977 bool DummyCanNegate;
3978 bool DummyMustBeFirst;
3979 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
3980 return SDValue();
3981
3982 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
3983}
3984
3985/// @}
3986
3987/// Returns how profitable it is to fold a comparison's operand's shift and/or
3988/// extension operations.
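/// For example, assuming a single use, (shl (and x, 0xff), 2) scores 2 because
/// both the mask (an extend) and the shift can fold into the compare operand,
/// while a plain (shl x, 20) scores 1.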
3989static unsigned getCmpOperandFoldingProfit(SDValue Op) {
3990 auto isSupportedExtend = [&](SDValue V) {
3991 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
3992 return true;
3993
3994 if (V.getOpcode() == ISD::AND)
3995 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
3996 uint64_t Mask = MaskCst->getZExtValue();
3997 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
3998 }
3999
4000 return false;
4001 };
4002
4003 if (!Op.hasOneUse())
4004 return 0;
4005
4006 if (isSupportedExtend(Op))
4007 return 1;
4008
4009 unsigned Opc = Op.getOpcode();
4010 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
4011 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4012 uint64_t Shift = ShiftCst->getZExtValue();
4013 if (isSupportedExtend(Op.getOperand(0)))
4014 return (Shift <= 4) ? 2 : 1;
4015 EVT VT = Op.getValueType();
4016 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
4017 return 1;
4018 }
4019
4020 return 0;
4021}
4022
4023static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4024 SDValue &AArch64cc, SelectionDAG &DAG,
4025 const SDLoc &dl) {
4026 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4027 EVT VT = RHS.getValueType();
4028 uint64_t C = RHSC->getZExtValue();
4029 if (!isLegalArithImmed(C)) {
4030 // Constant does not fit, try adjusting it by one?
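// For example, (setlt x, 0x1001) has an unencodable immediate, but it is
// equivalent to (setle x, 0x1000), whose immediate can be encoded.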
4031 switch (CC) {
4032 default:
4033 break;
4034 case ISD::SETLT:
4035 case ISD::SETGE:
4036 if ((VT == MVT::i32 && C != 0x80000000 &&
4037 isLegalArithImmed((uint32_t)(C - 1))) ||
4038 (VT == MVT::i64 && C != 0x80000000ULL &&
4039 isLegalArithImmed(C - 1ULL))) {
4040 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4041 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
4042 RHS = DAG.getConstant(C, dl, VT);
4043 }
4044 break;
4045 case ISD::SETULT:
4046 case ISD::SETUGE:
4047 if ((VT == MVT::i32 && C != 0 &&
4048 isLegalArithImmed((uint32_t)(C - 1))) ||
4049 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
4050 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4051 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
4052 RHS = DAG.getConstant(C, dl, VT);
4053 }
4054 break;
4055 case ISD::SETLE:
4056 case ISD::SETGT:
4057 if ((VT == MVT::i32 && C != INT32_MAX &&
4058 isLegalArithImmed((uint32_t)(C + 1))) ||
4059 (VT == MVT::i64 && C != INT64_MAX &&
4060 isLegalArithImmed(C + 1ULL))) {
4061 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4062 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
4063 RHS = DAG.getConstant(C, dl, VT);
4064 }
4065 break;
4066 case ISD::SETULE:
4067 case ISD::SETUGT:
4068 if ((VT == MVT::i32 && C != UINT32_MAX &&
4069 isLegalArithImmed((uint32_t)(C + 1))) ||
4070 (VT == MVT::i64 && C != UINT64_MAX &&
4071 isLegalArithImmed(C + 1ULL))) {
4072 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4073 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
4074 RHS = DAG.getConstant(C, dl, VT);
4075 }
4076 break;
4077 }
4078 }
4079 }
4080
4081 // Comparisons are canonicalized so that the RHS operand is simpler than the
4082 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
4083 // can fold some shift+extend operations on the RHS operand, so swap the
4084 // operands if that can be done.
4085 //
4086 // For example:
4087 // lsl w13, w11, #1
4088 // cmp w13, w12
4089 // can be turned into:
4090 // cmp w12, w11, lsl #1
4091 if (!isa<ConstantSDNode>(RHS) ||
4092 !isLegalArithImmed(RHS->getAsAPIntVal().abs().getZExtValue())) {
4093 bool LHSIsCMN = isCMN(LHS, CC, DAG);
4094 bool RHSIsCMN = isCMN(RHS, CC, DAG);
4095 SDValue TheLHS = LHSIsCMN ? LHS.getOperand(1) : LHS;
4096 SDValue TheRHS = RHSIsCMN ? RHS.getOperand(1) : RHS;
4097
4098 if (getCmpOperandFoldingProfit(TheLHS) + (LHSIsCMN ? 1 : 0) >
4099 getCmpOperandFoldingProfit(TheRHS) + (RHSIsCMN ? 1 : 0)) {
4100 std::swap(LHS, RHS);
4101 CC = ISD::getSetCCSwappedOperands(CC);
4102 }
4103 }
4104
4105 SDValue Cmp;
4106 AArch64CC::CondCode AArch64CC;
4107 if (isIntEqualitySetCC(CC) && isa<ConstantSDNode>(RHS)) {
4108 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
4109
4110 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
4111 // For the i8 operand, the largest immediate is 255, so this can be easily
4112 // encoded in the compare instruction. For the i16 operand, however, the
4113 // largest immediate cannot be encoded in the compare.
4114 // Therefore, use a sign extending load and cmn to avoid materializing the
4115 // -1 constant. For example,
4116 // movz w1, #65535
4117 // ldrh w0, [x0, #0]
4118 // cmp w0, w1
4119 // >
4120 // ldrsh w0, [x0, #0]
4121 // cmn w0, #1
4122 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
4123 // if and only if (sext LHS) == (sext RHS). The checks are in place to
4124 // ensure both the LHS and RHS are truly zero extended and to make sure the
4125 // transformation is profitable.
4126 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
4127 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
4128 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
4129 LHS.getNode()->hasNUsesOfValue(1, 0)) {
4130 int16_t ValueofRHS = RHS->getAsZExtVal();
4131 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
4132 SDValue SExt =
4133 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
4134 DAG.getValueType(MVT::i16));
4135 Cmp = emitComparison(
4136 SExt, DAG.getSignedConstant(ValueofRHS, dl, RHS.getValueType()), CC,
4137 dl, DAG);
4138 AArch64CC = changeIntCCToAArch64CC(CC);
4139 }
4140 }
4141
4142 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
4143 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
4144 if ((CC == ISD::SETNE) ^ RHSC->isZero())
4145 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
4146 }
4147 }
4148 }
4149
4150 if (!Cmp) {
4151 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
4152 AArch64CC = changeIntCCToAArch64CC(CC);
4153 }
4154 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
4155 return Cmp;
4156}
4157
4158static std::pair<SDValue, SDValue>
4159getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
4160 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
4161 "Unsupported value type");
4162 SDValue Value, Overflow;
4163 SDLoc DL(Op);
4164 SDValue LHS = Op.getOperand(0);
4165 SDValue RHS = Op.getOperand(1);
4166 unsigned Opc = 0;
4167 switch (Op.getOpcode()) {
4168 default:
4169 llvm_unreachable("Unknown overflow instruction!");
4170 case ISD::SADDO:
4171 Opc = AArch64ISD::ADDS;
4172 CC = AArch64CC::VS;
4173 break;
4174 case ISD::UADDO:
4175 Opc = AArch64ISD::ADDS;
4176 CC = AArch64CC::HS;
4177 break;
4178 case ISD::SSUBO:
4179 Opc = AArch64ISD::SUBS;
4180 CC = AArch64CC::VS;
4181 break;
4182 case ISD::USUBO:
4183 Opc = AArch64ISD::SUBS;
4184 CC = AArch64CC::LO;
4185 break;
4186 // Multiply needs a little bit of extra work.
4187 case ISD::SMULO:
4188 case ISD::UMULO: {
4189 CC = AArch64CC::NE;
4190 bool IsSigned = Op.getOpcode() == ISD::SMULO;
4191 if (Op.getValueType() == MVT::i32) {
4192 // Extend to 64-bits, then perform a 64-bit multiply.
4193 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4194 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
4195 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
4196 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4197 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
4198
4199 // Check that the result fits into a 32-bit integer.
4200 SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
4201 if (IsSigned) {
4202 // cmp xreg, wreg, sxtw
4203 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
4204 Overflow =
4205 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
4206 } else {
4207 // tst xreg, #0xffffffff00000000
4208 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
4209 Overflow =
4210 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
4211 }
4212 break;
4213 }
4214 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
4215 // For the 64 bit multiply
4216 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4217 if (IsSigned) {
4218 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
4219 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
4220 DAG.getConstant(63, DL, MVT::i64));
4221 // It is important that LowerBits is last, otherwise the arithmetic
4222 // shift will not be folded into the compare (SUBS).
4223 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
4224 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
4225 .getValue(1);
4226 } else {
4227 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
4228 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
4229 Overflow =
4230 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
4231 DAG.getConstant(0, DL, MVT::i64),
4232 UpperBits).getValue(1);
4233 }
4234 break;
4235 }
4236 } // switch (...)
4237
4238 if (Opc) {
4239 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
4240
4241 // Emit the AArch64 operation with overflow check.
4242 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
4243 Overflow = Value.getValue(1);
4244 }
4245 return std::make_pair(Value, Overflow);
4246}
4247
4248SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
4249 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
4250 !Subtarget->isNeonAvailable()))
4251 return LowerToScalableOp(Op, DAG);
4252
4253 SDValue Sel = Op.getOperand(0);
4254 SDValue Other = Op.getOperand(1);
4255 SDLoc dl(Sel);
4256
4257 // If the operand is an overflow checking operation, invert the condition
4258 // code and kill the Not operation. I.e., transform:
4259 // (xor (overflow_op_bool, 1))
4260 // -->
4261 // (csel 1, 0, invert(cc), overflow_op_bool)
4262 // ... which later gets transformed to just a cset instruction with an
4263 // inverted condition code, rather than a cset + eor sequence.
4264 if (isOverflowIntrOpRes(Sel) && isOneConstant(Other)) {
4265 // Only lower legal XALUO ops.
4266 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
4267 return SDValue();
4268
4269 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4270 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4271 AArch64CC::CondCode CC;
4272 SDValue Value, Overflow;
4273 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
4274 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
4275 return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
4276 CCVal, Overflow);
4277 }
4278 // If neither operand is a SELECT_CC, give up.
4279 if (Sel.getOpcode() != ISD::SELECT_CC)
4280 std::swap(Sel, Other);
4281 if (Sel.getOpcode() != ISD::SELECT_CC)
4282 return Op;
4283
4284 // The folding we want to perform is:
4285 // (xor x, (select_cc a, b, cc, 0, -1) )
4286 // -->
4287 // (csel x, (xor x, -1), cc ...)
4288 //
4289 // The latter will get matched to a CSINV instruction.
4290
4291 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
4292 SDValue LHS = Sel.getOperand(0);
4293 SDValue RHS = Sel.getOperand(1);
4294 SDValue TVal = Sel.getOperand(2);
4295 SDValue FVal = Sel.getOperand(3);
4296
4297 // FIXME: This could be generalized to non-integer comparisons.
4298 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
4299 return Op;
4300
4301 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
4302 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
4303
4304 // The values aren't constants, this isn't the pattern we're looking for.
4305 if (!CFVal || !CTVal)
4306 return Op;
4307
4308 // We can commute the SELECT_CC by inverting the condition. This
4309 // might be needed to make this fit into a CSINV pattern.
4310 if (CTVal->isAllOnes() && CFVal->isZero()) {
4311 std::swap(TVal, FVal);
4312 std::swap(CTVal, CFVal);
4313 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
4314 }
4315
4316 // If the constants line up, perform the transform!
4317 if (CTVal->isZero() && CFVal->isAllOnes()) {
4318 SDValue CCVal;
4319 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
4320
4321 FVal = Other;
4322 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
4323 DAG.getAllOnesConstant(dl, Other.getValueType()));
4324
4325 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
4326 CCVal, Cmp);
4327 }
4328
4329 return Op;
4330}
4331
4332// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
4333// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
4334// sets 'C' bit to 0.
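// With Invert==false this emits (SUBS Value, 1), so C == (Value != 0); with
// Invert==true it emits (SUBS 0, Value), so C == (Value == 0).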
4335static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
4336 SDLoc DL(Value);
4337 EVT VT = Value.getValueType();
4338 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
4339 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
4340 SDValue Cmp =
4341 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
4342 return Cmp.getValue(1);
4343}
4344
4345// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
4346// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
4347static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG,
4348 bool Invert) {
4349 assert(Glue.getResNo() == 1);
4350 SDLoc DL(Glue);
4351 SDValue Zero = DAG.getConstant(0, DL, VT);
4352 SDValue One = DAG.getConstant(1, DL, VT);
4353 unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
4354 SDValue CC = DAG.getConstant(Cond, DL, MVT::i32);
4355 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4356}
4357
4358// Value is 1 if 'V' bit of NZCV is 1, else 0
4359static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) {
4360 assert(Glue.getResNo() == 1);
4361 SDLoc DL(Glue);
4362 SDValue Zero = DAG.getConstant(0, DL, VT);
4363 SDValue One = DAG.getConstant(1, DL, VT);
4364 SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32);
4365 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4366}
4367
4368// This lowering is inefficient, but it will get cleaned up by
4369// `foldOverflowCheck`
4370static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
4371 unsigned Opcode, bool IsSigned) {
4372 EVT VT0 = Op.getValue(0).getValueType();
4373 EVT VT1 = Op.getValue(1).getValueType();
4374
4375 if (VT0 != MVT::i32 && VT0 != MVT::i64)
4376 return SDValue();
4377
4378 bool InvertCarry = Opcode == AArch64ISD::SBCS;
4379 SDValue OpLHS = Op.getOperand(0);
4380 SDValue OpRHS = Op.getOperand(1);
4381 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
4382
4383 SDLoc DL(Op);
4384 SDVTList VTs = DAG.getVTList(VT0, VT1);
4385
4386 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
4387 OpRHS, OpCarryIn);
4388
4389 SDValue OutFlag =
4390 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
4391 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
4392
4393 return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag);
4394}
4395
4396static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
4397 // Let legalize expand this if it isn't a legal type yet.
4398 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4399 return SDValue();
4400
4401 SDLoc dl(Op);
4402 AArch64CC::CondCode CC;
4403 // The actual operation that sets the overflow or carry flag.
4404 SDValue Value, Overflow;
4405 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
4406
4407 // We use 0 and 1 as false and true values.
4408 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4409 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4410
4411 // We use an inverted condition, because the conditional select is inverted
4412 // too. This will allow it to be selected to a single instruction:
4413 // CSINC Wd, WZR, WZR, invert(cond).
4414 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
4415 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
4416 CCVal, Overflow);
4417
4418 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
4419 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4420}
4421
4422// Prefetch operands are:
4423// 1: Address to prefetch
4424// 2: bool isWrite
4425// 3: int locality (0 = no locality ... 3 = extreme locality)
4426// 4: bool isDataCache
4427static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
4428 SDLoc DL(Op);
4429 unsigned IsWrite = Op.getConstantOperandVal(2);
4430 unsigned Locality = Op.getConstantOperandVal(3);
4431 unsigned IsData = Op.getConstantOperandVal(4);
4432
4433 bool IsStream = !Locality;
4434 // When the locality number is set
4435 if (Locality) {
4436 // The front-end should have filtered out the out-of-range values
4437 assert(Locality <= 3 && "Prefetch locality out-of-range");
4438 // The locality degree is the opposite of the cache speed.
4439 // Put the number the other way around.
4440 // The encoding starts at 0 for level 1
4441 Locality = 3 - Locality;
4442 }
4443
4444 // Build the mask value encoding the expected behavior.
4445 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4446 (!IsData << 3) | // IsDataCache bit
4447 (Locality << 1) | // Cache level bits
4448 (unsigned)IsStream; // Stream bit
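// For example, a data read prefetch with locality 3 encodes as 0b00000
// (PLDL1KEEP), and a data write prefetch with locality 1 encodes as 0b10100
// (PSTL3KEEP).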
4449 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
4450 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
4451 Op.getOperand(1));
4452}
4453
4454// Converts SETCC (AND X Y) Z ULT -> SETCC (AND X (Y & ~(Z - 1))) 0 EQ when Z is
4455// a power of 2. This is then lowered to ANDS X (Y & ~(Z - 1)) instead of SUBS
4456// (AND X Y) Z which produces a better opt with EmitComparison
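// For example, (setult (and x, 0xff), 16) becomes (seteq (and x, 0xf0), 0),
// which can be selected as a single ANDS/TST instead of an AND plus SUBS.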
4457static void simplifySetCCIntoEq(ISD::CondCode &CC, SDValue &LHS, SDValue &RHS,
4458 SelectionDAG &DAG, const SDLoc dl) {
4459 if (CC == ISD::SETULT && LHS.getOpcode() == ISD::AND && LHS->hasOneUse()) {
4460 ConstantSDNode *LHSConstOp = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
4461 ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
4462 if (LHSConstOp && RHSConst) {
4463 uint64_t LHSConstValue = LHSConstOp->getZExtValue();
4464 uint64_t RHSConstant = RHSConst->getZExtValue();
4465 if (isPowerOf2_64(RHSConstant)) {
4466 uint64_t NewMaskValue = LHSConstValue & ~(RHSConstant - 1);
4467 LHS =
4468 DAG.getNode(ISD::AND, dl, LHS.getValueType(), LHS.getOperand(0),
4469 DAG.getConstant(NewMaskValue, dl, LHS.getValueType()));
4470 RHS = DAG.getConstant(0, dl, RHS.getValueType());
4471 CC = ISD::SETEQ;
4472 }
4473 }
4474 }
4475}
4476
4477SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4478 SelectionDAG &DAG) const {
4479 EVT VT = Op.getValueType();
4480 if (VT.isScalableVector()) {
4481 SDValue SrcVal = Op.getOperand(0);
4482
4483 if (SrcVal.getValueType().getScalarType() == MVT::bf16) {
4484 // bf16 and f32 share the same exponent range so the conversion requires
4485 // them to be aligned with the new mantissa bits zero'd. This is just a
4486 // left shift that is best to isel directly.
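// For example, bf16 1.0 (0x3f80) shifted left by 16 gives 0x3f800000, which
// is exactly the f32 encoding of 1.0.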
4487 if (VT == MVT::nxv2f32 || VT == MVT::nxv4f32)
4488 return Op;
4489
4490 if (VT != MVT::nxv2f64)
4491 return SDValue();
4492
4493 // Break other conversions in two with the first part converting to f32
4494 // and the second using native f32->VT instructions.
4495 SDLoc DL(Op);
4496 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
4497 DAG.getNode(ISD::FP_EXTEND, DL, MVT::nxv2f32, SrcVal));
4498 }
4499
4500 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4501 }
4502
4503 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4504 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4505
4506 bool IsStrict = Op->isStrictFPOpcode();
4507 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
4508 EVT Op0VT = Op0.getValueType();
4509 if (VT == MVT::f64) {
4510 // f32->f64 and f16->f64 extends are legal.
4511 if (Op0VT == MVT::f32 || Op0VT == MVT::f16)
4512 return Op;
4513 // Split bf16->f64 extends into two fpextends.
4514 if (Op0VT == MVT::bf16 && IsStrict) {
4515 SDValue Ext1 =
4516 DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {MVT::f32, MVT::Other},
4517 {Op0, Op.getOperand(0)});
4518 return DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {VT, MVT::Other},
4519 {Ext1, Ext1.getValue(1)});
4520 }
4521 if (Op0VT == MVT::bf16)
4522 return DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), VT,
4523 DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, Op0));
4524 return SDValue();
4525 }
4526
4527 if (VT.getScalarType() == MVT::f32) {
4528 // f16->f32 extends are legal, for both scalars and vectors.
4529 if (Op0VT.getScalarType() == MVT::f16)
4530 return Op;
4531 if (Op0VT.getScalarType() == MVT::bf16) {
4532 SDLoc DL(Op);
4533 EVT IVT = VT.changeTypeToInteger();
4534 if (!Op0VT.isVector()) {
4535 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4bf16, Op0);
4536 IVT = MVT::v4i32;
4537 }
4538
4539 EVT Op0IVT = Op0.getValueType().changeTypeToInteger();
4540 SDValue Ext =
4541 DAG.getNode(ISD::ANY_EXTEND, DL, IVT, DAG.getBitcast(Op0IVT, Op0));
4542 SDValue Shift =
4543 DAG.getNode(ISD::SHL, DL, IVT, Ext, DAG.getConstant(16, DL, IVT));
4544 if (!Op0VT.isVector())
4545 Shift = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Shift,
4546 DAG.getConstant(0, DL, MVT::i64));
4547 Shift = DAG.getBitcast(VT, Shift);
4548 return IsStrict ? DAG.getMergeValues({Shift, Op.getOperand(0)}, DL)
4549 : Shift;
4550 }
4551 return SDValue();
4552 }
4553
4554 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4555 return SDValue();
4556}
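// Illustrative note on the bf16 -> f32 path above: bf16 is the upper half of
// an f32 with the same sign and exponent, so the widening really is just
// "bitcast to int, shift left by 16, bitcast back". For example the bf16
// pattern 0x3FC0 (1.5) becomes 0x3FC00000, which is exactly 1.5f.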
4557
4558SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4559 SelectionDAG &DAG) const {
4560 EVT VT = Op.getValueType();
4561 bool IsStrict = Op->isStrictFPOpcode();
4562 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4563 EVT SrcVT = SrcVal.getValueType();
4564 bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
4565
4566 if (VT.isScalableVector()) {
4567 if (VT.getScalarType() != MVT::bf16)
4568 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4569
4570 SDLoc DL(Op);
4571 constexpr EVT I32 = MVT::nxv4i32;
4572 auto ImmV = [&](int I) -> SDValue { return DAG.getConstant(I, DL, I32); };
4573
4574 SDValue NaN;
4575 SDValue Narrow;
4576
4577 if (SrcVT == MVT::nxv2f32 || SrcVT == MVT::nxv4f32) {
4578 if (Subtarget->hasBF16())
4579 return LowerToPredicatedOp(Op, DAG,
4580 AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4581
4582 Narrow = getSVESafeBitCast(I32, SrcVal, DAG);
4583
4584 // Set the quiet bit.
4585 if (!DAG.isKnownNeverSNaN(SrcVal))
4586 NaN = DAG.getNode(ISD::OR, DL, I32, Narrow, ImmV(0x400000));
4587 } else if (SrcVT == MVT::nxv2f64 &&
4588 (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable())) {
4589 // Round to float without introducing rounding errors and try again.
4590 SDValue Pg = getPredicateForVector(DAG, DL, MVT::nxv2f32);
4591 Narrow = DAG.getNode(AArch64ISD::FCVTX_MERGE_PASSTHRU, DL, MVT::nxv2f32,
4592 Pg, SrcVal, DAG.getUNDEF(MVT::nxv2f32));
4593
4595 if (IsStrict)
4596 NewOps.push_back(Op.getOperand(0));
4597 NewOps.push_back(Narrow);
4598 NewOps.push_back(Op.getOperand(IsStrict ? 2 : 1));
4599 return DAG.getNode(Op.getOpcode(), DL, VT, NewOps, Op->getFlags());
4600 } else
4601 return SDValue();
4602
4603 if (!Trunc) {
4604 SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4605 Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, ImmV(1));
4606 SDValue RoundingBias = DAG.getNode(ISD::ADD, DL, I32, Lsb, ImmV(0x7fff));
4607 Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
4608 }
4609
4610 // Don't round if we had a NaN; we don't want to turn 0x7fffffff into
4611 // 0x80000000.
4612 if (NaN) {
4613 EVT I1 = I32.changeElementType(MVT::i1);
4614 EVT CondVT = VT.changeElementType(MVT::i1);
4615 SDValue IsNaN = DAG.getSetCC(DL, CondVT, SrcVal, SrcVal, ISD::SETUO);
4616 IsNaN = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, I1, IsNaN);
4617 Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
4618 }
4619
4620 // Now that we have rounded, shift the bits into position.
4621 Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4622 return getSVESafeBitCast(VT, Narrow, DAG);
4623 }
4624
4625 if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
4626 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4627
4628 // Expand cases where the result type is BF16 but we don't have hardware
4629 // instructions to lower it.
4630 if (VT.getScalarType() == MVT::bf16 &&
4631 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4632 Subtarget->hasBF16())) {
4633 SDLoc dl(Op);
4634 SDValue Narrow = SrcVal;
4635 SDValue NaN;
4636 EVT I32 = SrcVT.changeElementType(MVT::i32);
4637 EVT F32 = SrcVT.changeElementType(MVT::f32);
4638 if (SrcVT.getScalarType() == MVT::f32) {
4639 bool NeverSNaN = DAG.isKnownNeverSNaN(Narrow);
4640 Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4641 if (!NeverSNaN) {
4642 // Set the quiet bit.
4643 NaN = DAG.getNode(ISD::OR, dl, I32, Narrow,
4644 DAG.getConstant(0x400000, dl, I32));
4645 }
4646 } else if (SrcVT.getScalarType() == MVT::f64) {
4647 Narrow = DAG.getNode(AArch64ISD::FCVTXN, dl, F32, Narrow);
4648 Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4649 } else {
4650 return SDValue();
4651 }
4652 if (!Trunc) {
4653 SDValue One = DAG.getConstant(1, dl, I32);
4654 SDValue Lsb = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4655 DAG.getShiftAmountConstant(16, I32, dl));
4656 Lsb = DAG.getNode(ISD::AND, dl, I32, Lsb, One);
4657 SDValue RoundingBias =
4658 DAG.getNode(ISD::ADD, dl, I32, DAG.getConstant(0x7fff, dl, I32), Lsb);
4659 Narrow = DAG.getNode(ISD::ADD, dl, I32, Narrow, RoundingBias);
4660 }
4661
4662 // Don't round if we had a NaN; we don't want to turn 0x7fffffff into
4663 // 0x80000000.
4664 if (NaN) {
4665 SDValue IsNaN = DAG.getSetCC(
4666 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT),
4667 SrcVal, SrcVal, ISD::SETUO);
4668 Narrow = DAG.getSelect(dl, I32, IsNaN, NaN, Narrow);
4669 }
4670
4671 // Now that we have rounded, shift the bits into position.
4672 Narrow = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4673 DAG.getShiftAmountConstant(16, I32, dl));
4674 if (VT.isVector()) {
4675 EVT I16 = I32.changeVectorElementType(MVT::i16);
4676 Narrow = DAG.getNode(ISD::TRUNCATE, dl, I16, Narrow);
4677 return DAG.getNode(ISD::BITCAST, dl, VT, Narrow);
4678 }
4679 Narrow = DAG.getNode(ISD::BITCAST, dl, F32, Narrow);
4680 SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, dl, VT, Narrow);
4681 return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, dl)
4682 : Result;
4683 }
4684
4685 if (SrcVT != MVT::f128) {
4686 // Expand cases where the input is a vector bigger than NEON.
4687 if (useSVEForFixedLengthVectorVT(SrcVT))
4688 return SDValue();
4689
4690 // It's legal except when f128 is involved
4691 return Op;
4692 }
4693
4694 return SDValue();
4695}
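// Worked example of the round-to-nearest-even bias used above (illustrative
// bits): rounding f32 0x3F808000 (1.00390625) to bf16 takes
// Lsb = (0x3F808000 >> 16) & 1 = 0, bias = 0x7FFF, and
// (0x3F808000 + 0x7FFF) >> 16 = 0x3F80, i.e. the tie rounds to the even value
// 1.0. Quieting NaNs first (OR 0x400000) keeps the bias from carrying a NaN
// payload such as 0x7FFFFFFF over into 0x80000000.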
4696
4697SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4698 SelectionDAG &DAG) const {
4699 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4700 // Any additional optimization in this function should be recorded
4701 // in the cost tables.
4702 bool IsStrict = Op->isStrictFPOpcode();
4703 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
4704 EVT VT = Op.getValueType();
4705
4706 if (VT.isScalableVector()) {
4707 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4708 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4709 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4710 return LowerToPredicatedOp(Op, DAG, Opcode);
4711 }
4712
4713 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4714 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4715 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4716
4717 unsigned NumElts = InVT.getVectorNumElements();
4718
4719 // f16 conversions are promoted to f32 when full fp16 is not supported.
4720 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4721 InVT.getVectorElementType() == MVT::bf16) {
4722 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
4723 SDLoc dl(Op);
4724 if (IsStrict) {
4725 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
4726 {Op.getOperand(0), Op.getOperand(1)});
4727 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4728 {Ext.getValue(1), Ext.getValue(0)});
4729 }
4730 return DAG.getNode(
4731 Op.getOpcode(), dl, Op.getValueType(),
4732 DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
4733 }
4734
4735 uint64_t VTSize = VT.getFixedSizeInBits();
4736 uint64_t InVTSize = InVT.getFixedSizeInBits();
4737 if (VTSize < InVTSize) {
4738 SDLoc dl(Op);
4739 if (IsStrict) {
4740 InVT = InVT.changeVectorElementTypeToInteger();
4741 SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other},
4742 {Op.getOperand(0), Op.getOperand(1)});
4743 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4744 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl);
4745 }
4746 SDValue Cv =
4747 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
4748 Op.getOperand(0));
4749 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4750 }
4751
4752 if (VTSize > InVTSize) {
4753 SDLoc dl(Op);
4754 MVT ExtVT =
4755 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
4756 VT.getVectorNumElements());
4757 if (IsStrict) {
4758 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other},
4759 {Op.getOperand(0), Op.getOperand(1)});
4760 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4761 {Ext.getValue(1), Ext.getValue(0)});
4762 }
4763 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
4764 return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
4765 }
4766
4767 // Use a scalar operation for conversions between single-element vectors of
4768 // the same size.
4769 if (NumElts == 1) {
4770 SDLoc dl(Op);
4771 SDValue Extract = DAG.getNode(
4772 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getVectorElementType(),
4773 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64));
4774 EVT ScalarVT = VT.getScalarType();
4775 if (IsStrict)
4776 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4777 {Op.getOperand(0), Extract});
4778 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4779 }
4780
4781 // Type changing conversions are illegal.
4782 return Op;
4783}
4784
4785SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4786 SelectionDAG &DAG) const {
4787 bool IsStrict = Op->isStrictFPOpcode();
4788 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4789
4790 if (SrcVal.getValueType().isVector())
4791 return LowerVectorFP_TO_INT(Op, DAG);
4792
4793 // f16 conversions are promoted to f32 when full fp16 is not supported.
4794 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4795 SrcVal.getValueType() == MVT::bf16) {
4796 SDLoc dl(Op);
4797 if (IsStrict) {
4798 SDValue Ext =
4799 DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
4800 {Op.getOperand(0), SrcVal});
4801 return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other},
4802 {Ext.getValue(1), Ext.getValue(0)});
4803 }
4804 return DAG.getNode(
4805 Op.getOpcode(), dl, Op.getValueType(),
4806 DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
4807 }
4808
4809 if (SrcVal.getValueType() != MVT::f128) {
4810 // It's legal except when f128 is involved
4811 return Op;
4812 }
4813
4814 return SDValue();
4815}
4816
4817SDValue
4818AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4819 SelectionDAG &DAG) const {
4820 // AArch64 FP-to-int conversions saturate to the destination element size, so
4821 // we can lower common saturating conversions to simple instructions.
4822 SDValue SrcVal = Op.getOperand(0);
4823 EVT SrcVT = SrcVal.getValueType();
4824 EVT DstVT = Op.getValueType();
4825 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4826
4827 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4828 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4829 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4830 assert(SatWidth <= DstElementWidth &&
4831 "Saturation width cannot exceed result width");
4832
4833 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4834 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4835 // types, so this is hard to reach.
4836 if (DstVT.isScalableVector())
4837 return SDValue();
4838
4839 EVT SrcElementVT = SrcVT.getVectorElementType();
4840
4841 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4842 SDLoc DL(Op);
4843 SDValue SrcVal2;
4844 if ((SrcElementVT == MVT::f16 &&
4845 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
4846 SrcElementVT == MVT::bf16) {
4847 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
4848 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F32VT, SrcVal);
4849 // If we are extending to a v8f32, split into two v4f32 to produce legal
4850 // types.
4851 if (F32VT.getSizeInBits() > 128) {
4852 std::tie(SrcVal, SrcVal2) = DAG.SplitVector(SrcVal, DL);
4853 F32VT = F32VT.getHalfNumVectorElementsVT();
4854 }
4855 SrcVT = F32VT;
4856 SrcElementVT = MVT::f32;
4857 SrcElementWidth = 32;
4858 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4859 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
4860 return SDValue();
4861
4862 // Expand to f64 if we are saturating to i64, to help keep the lanes the same
4863 // width and produce a fcvtzu.
4864 if (SatWidth == 64 && SrcElementWidth < 64) {
4865 MVT F64VT = MVT::getVectorVT(MVT::f64, SrcVT.getVectorNumElements());
4866 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F64VT, SrcVal);
4867 SrcVT = F64VT;
4868 SrcElementVT = MVT::f64;
4869 SrcElementWidth = 64;
4870 }
4871 // Cases that we can emit directly.
4872 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth) {
4873 SDValue Res = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4874 DAG.getValueType(DstVT.getScalarType()));
4875 if (SrcVal2) {
4876 SDValue Res2 = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal2,
4877 DAG.getValueType(DstVT.getScalarType()));
4878 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Res, Res2);
4879 }
4880 return Res;
4881 }
4882
4883 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4884 // result. This is only valid if the legal cvt is larger than the saturate
4885 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
4886 // (at least until sqxtn is selected).
4887 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4888 return SDValue();
4889
4890 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4891 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
4892 DAG.getValueType(IntVT.getScalarType()));
4893 SDValue NativeCvt2 =
4894 SrcVal2 ? DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal2,
4895 DAG.getValueType(IntVT.getScalarType()))
4896 : SDValue();
4897 SDValue Sat, Sat2;
4898 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4899 SDValue MinC = DAG.getConstant(
4900 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4901 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
4902 SDValue Min2 = SrcVal2 ? DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
4903 SDValue MaxC = DAG.getConstant(
4904 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4905 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
4906 Sat2 = SrcVal2 ? DAG.getNode(ISD::SMAX, DL, IntVT, Min2, MaxC) : SDValue();
4907 } else {
4908 SDValue MinC = DAG.getConstant(
4909 APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
4910 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
4911 Sat2 = SrcVal2 ? DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
4912 }
4913
4914 if (SrcVal2)
4915 Sat = DAG.getNode(ISD::CONCAT_VECTORS, DL,
4916 IntVT.getDoubleNumVectorElementsVT(*DAG.getContext()),
4917 Sat, Sat2);
4918
4919 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4920}
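// Illustrative example of the clamp sequence above: a v4f32 -> v4i16
// fptosi.sat first converts natively to v4i32, clamps with SMIN(..., 32767)
// and SMAX(..., -32768), and only then truncates to v4i16, so lanes that
// overflow i16 saturate instead of wrapping. The unsigned variant clamps with
// UMIN(..., 0xFFFF).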
4921
4922SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
4923 SelectionDAG &DAG) const {
4924 // AArch64 FP-to-int conversions saturate to the destination register size, so
4925 // we can lower common saturating conversions to simple instructions.
4926 SDValue SrcVal = Op.getOperand(0);
4927 EVT SrcVT = SrcVal.getValueType();
4928
4929 if (SrcVT.isVector())
4930 return LowerVectorFP_TO_INT_SAT(Op, DAG);
4931
4932 EVT DstVT = Op.getValueType();
4933 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4934 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4935 uint64_t DstWidth = DstVT.getScalarSizeInBits();
4936 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
4937
4938 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4939 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
4940 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
4941 SrcVT = MVT::f32;
4942 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
4943 SrcVT != MVT::bf16)
4944 return SDValue();
4945
4946 SDLoc DL(Op);
4947 // Cases that we can emit directly.
4948 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
4949 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
4950 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
4951 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4952 DAG.getValueType(DstVT));
4953
4954 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4955 // result. This is only valid if the legal cvt is larger than the saturate
4956 // width.
4957 if (DstWidth < SatWidth)
4958 return SDValue();
4959
4960 SDValue NativeCvt =
4961 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
4962 SDValue Sat;
4963 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4964 SDValue MinC = DAG.getConstant(
4965 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
4966 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
4967 SDValue MaxC = DAG.getConstant(
4968 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
4969 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
4970 } else {
4971 SDValue MinC = DAG.getConstant(
4972 APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
4973 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
4974 }
4975
4976 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4977}
4978
4979SDValue AArch64TargetLowering::LowerVectorXRINT(SDValue Op,
4980 SelectionDAG &DAG) const {
4981 EVT VT = Op.getValueType();
4982 SDValue Src = Op.getOperand(0);
4983 SDLoc DL(Op);
4984
4985 assert(VT.isVector() && "Expected vector type");
4986
4987 EVT CastVT =
4988 VT.changeVectorElementType(Src.getValueType().getVectorElementType());
4989
4990 // Round the floating-point value into a floating-point register with the
4991 // current rounding mode.
4992 SDValue FOp = DAG.getNode(ISD::FRINT, DL, CastVT, Src);
4993
4994 // Truncate the rounded floating point to an integer.
4995 return DAG.getNode(ISD::FP_TO_SINT_SAT, DL, VT, FOp,
4996 DAG.getValueType(VT.getVectorElementType()));
4997}
4998
4999SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
5000 SelectionDAG &DAG) const {
5001 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
5002 // Any additional optimization in this function should be recorded
5003 // in the cost tables.
5004 bool IsStrict = Op->isStrictFPOpcode();
5005 EVT VT = Op.getValueType();
5006 SDLoc dl(Op);
5007 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
5008 EVT InVT = In.getValueType();
5009 unsigned Opc = Op.getOpcode();
5010 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
5011
5012 if (VT.isScalableVector()) {
5013 if (InVT.getVectorElementType() == MVT::i1) {
5014 // We can't directly extend an SVE predicate; extend it first.
5015 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5016 EVT CastVT = getPromotedVTForPredicate(InVT);
5017 In = DAG.getNode(CastOpc, dl, CastVT, In);
5018 return DAG.getNode(Opc, dl, VT, In);
5019 }
5020
5021 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
5022 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
5023 return LowerToPredicatedOp(Op, DAG, Opcode);
5024 }
5025
5026 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
5027 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
5028 return LowerFixedLengthIntToFPToSVE(Op, DAG);
5029
5030 // Promote bf16 conversions to f32.
5031 if (VT.getVectorElementType() == MVT::bf16) {
5032 EVT F32 = VT.changeElementType(MVT::f32);
5033 if (IsStrict) {
5034 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {F32, MVT::Other},
5035 {Op.getOperand(0), In});
5036 return DAG.getNode(ISD::STRICT_FP_ROUND, dl,
5037 {Op.getValueType(), MVT::Other},
5038 {Val.getValue(1), Val.getValue(0),
5039 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
5040 }
5041 return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
5042 DAG.getNode(Op.getOpcode(), dl, F32, In),
5043 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
5044 }
5045
5046 uint64_t VTSize = VT.getFixedSizeInBits();
5047 uint64_t InVTSize = InVT.getFixedSizeInBits();
5048 if (VTSize < InVTSize) {
5049 MVT CastVT =
5050 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
5051 InVT.getVectorNumElements());
5051 InVT.getVectorNumElements());
5052 if (IsStrict) {
5053 In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
5054 {Op.getOperand(0), In});
5055 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
5056 {In.getValue(1), In.getValue(0),
5057 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
5058 }
5059 In = DAG.getNode(Opc, dl, CastVT, In);
5060 return DAG.getNode(ISD::FP_ROUND, dl, VT, In,
5061 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
5062 }
5063
5064 if (VTSize > InVTSize) {
5065 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5066 EVT CastVT = VT.changeVectorElementTypeToInteger();
5067 In = DAG.getNode(CastOpc, dl, CastVT, In);
5068 if (IsStrict)
5069 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In});
5070 return DAG.getNode(Opc, dl, VT, In);
5071 }
5072
5073 // Use a scalar operation for conversions between single-element vectors of
5074 // the same size.
5075 if (VT.getVectorNumElements() == 1) {
5076 SDValue Extract = DAG.getNode(
5077 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getVectorElementType(),
5078 In, DAG.getConstant(0, dl, MVT::i64));
5079 EVT ScalarVT = VT.getScalarType();
5080 if (IsStrict)
5081 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
5082 {Op.getOperand(0), Extract});
5083 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
5084 }
5085
5086 return Op;
5087}
5088
5089SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
5090 SelectionDAG &DAG) const {
5091 if (Op.getValueType().isVector())
5092 return LowerVectorINT_TO_FP(Op, DAG);
5093
5094 bool IsStrict = Op->isStrictFPOpcode();
5095 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5096
5097 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
5098 Op->getOpcode() == ISD::SINT_TO_FP;
5099
5100 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
5101 SDLoc dl(Op);
5102 if (IsStrict) {
5103 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {PromoteVT, MVT::Other},
5104 {Op.getOperand(0), SrcVal});
5105 return DAG.getNode(ISD::STRICT_FP_ROUND, dl,
5106 {Op.getValueType(), MVT::Other},
5107 {Val.getValue(1), Val.getValue(0),
5108 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
5109 }
5110 return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
5111 DAG.getNode(Op.getOpcode(), dl, PromoteVT, SrcVal),
5112 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
5113 };
5114
5115 if (Op.getValueType() == MVT::bf16) {
5116 unsigned MaxWidth = IsSigned
5117 ? DAG.ComputeMaxSignificantBits(SrcVal)
5118 : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
5119 // bf16 conversions are promoted to f32 when converting from i16.
5120 if (MaxWidth <= 24) {
5121 return IntToFpViaPromotion(MVT::f32);
5122 }
5123
5124 // bf16 conversions are promoted to f64 when converting from i32.
5125 if (MaxWidth <= 53) {
5126 return IntToFpViaPromotion(MVT::f64);
5127 }
5128
5129 // We need to be careful about i64 -> bf16.
5130 // Consider the value 22216703.
5131 // This number cannot be represented exactly as an f32, so an itofp will
5132 // turn it into 22216704.0, and an fptrunc to bf16 then turns that into
5133 // 22282240.0. However, the correctly rounded bf16 value is 22151168.0.
5134 // We need to use sticky rounding to get this correct.
5135 if (SrcVal.getValueType() == MVT::i64) {
5136 SDLoc DL(Op);
5137 // This algorithm is equivalent to the following:
5138 // uint64_t SrcHi = SrcVal & ~0xfffull;
5139 // uint64_t SrcLo = SrcVal & 0xfffull;
5140 // uint64_t Highest = SrcVal >> 53;
5141 // bool HasHighest = Highest != 0;
5142 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
5143 // double Rounded = static_cast<double>(ToRound);
5144 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
5145 // uint64_t HasLo = SrcLo != 0;
5146 // bool NeedsAdjustment = HasHighest & HasLo;
5147 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
5148 // double Adjusted = std::bit_cast<double>(AdjustedBits);
5149 // return static_cast<__bf16>(Adjusted);
5150 //
5151 // Essentially, what happens is that SrcVal either fits perfectly in a
5152 // double-precision value or it is too big. If it is sufficiently small,
5153 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
5154 // ensure that u64 -> double has no rounding error by only using the 52
5155 // MSB of the input. The low order bits will get merged into a sticky bit
5156 // which will avoid issues incurred by double rounding.
5157
5158 // Signed conversion is more or less like so:
5159 // copysign((__bf16)abs(SrcVal), SrcVal)
5160 SDValue SignBit;
5161 if (IsSigned) {
5162 SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5163 DAG.getConstant(1ull << 63, DL, MVT::i64));
5164 SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
5165 }
5166 SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5167 DAG.getConstant(~0xfffull, DL, MVT::i64));
5168 SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5169 DAG.getConstant(0xfffull, DL, MVT::i64));
5170 SDValue Highest =
5171 DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
5172 DAG.getShiftAmountConstant(53, MVT::i64, DL));
5173 SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
5174 SDValue ToRound =
5175 DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
5176 SDValue Rounded =
5177 IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
5178 {Op.getOperand(0), ToRound})
5179 : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
5180
5181 SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
5182 if (SignBit) {
5183 RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
5184 }
5185
5186 SDValue HasHighest = DAG.getSetCC(
5187 DL,
5188 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5189 Highest, Zero64, ISD::SETNE);
5190
5191 SDValue HasLo = DAG.getSetCC(
5192 DL,
5193 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5194 SrcLo, Zero64, ISD::SETNE);
5195
5196 SDValue NeedsAdjustment =
5197 DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
5198 NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
5199
5200 SDValue AdjustedBits =
5201 DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
5202 SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
5203 return IsStrict
5204 ? DAG.getNode(
5205 ISD::STRICT_FP_ROUND, DL,
5206 {Op.getValueType(), MVT::Other},
5207 {Rounded.getValue(1), Adjusted,
5208 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)})
5209 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
5210 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5211 }
5212 }
5213
5214 // f16 conversions are promoted to f32 when full fp16 is not supported.
5215 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
5216 return IntToFpViaPromotion(MVT::f32);
5217 }
5218
5219 // i128 conversions are libcalls.
5220 if (SrcVal.getValueType() == MVT::i128)
5221 return SDValue();
5222
5223 // Other conversions are legal, unless it's to the completely software-based
5224 // fp128.
5225 if (Op.getValueType() != MVT::f128)
5226 return Op;
5227 return SDValue();
5228}
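// Worked illustration of why the sticky bit in the i64 -> bf16 path above
// matters (illustrative value): for SrcVal = 2^53 + 2^45 + 1 the correctly
// rounded bf16 result is 2^53 + 2^46, since the input sits just above the
// halfway point between the representable values 2^53 and 2^53 + 2^46.
// A plain uitofp to double can land exactly on the halfway value 2^53 + 2^45,
// and the following fptrunc then ties to even and yields 2^53. The code above
// instead converts SrcHi = 2^53 + 2^45 exactly and ORs (SrcLo != 0) into the
// low mantissa bit, giving 2^53 + 2^45 + 2, which the final fptrunc correctly
// rounds up.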
5229
5230SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
5231 SelectionDAG &DAG) const {
5232 // For iOS, we want to call an alternative entry point: __sincos_stret,
5233 // which returns the values in two S / D registers.
5234 SDLoc dl(Op);
5235 SDValue Arg = Op.getOperand(0);
5236 EVT ArgVT = Arg.getValueType();
5237 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
5238
5240 ArgListEntry Entry;
5241
5242 Entry.Node = Arg;
5243 Entry.Ty = ArgTy;
5244 Entry.IsSExt = false;
5245 Entry.IsZExt = false;
5246 Args.push_back(Entry);
5247
5248 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
5249 : RTLIB::SINCOS_STRET_F32;
5250 const char *LibcallName = getLibcallName(LC);
5251 SDValue Callee =
5252 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
5253
5254 StructType *RetTy = StructType::get(ArgTy, ArgTy);
5255 TargetLowering::CallLoweringInfo CLI(DAG);
5256 CLI.setDebugLoc(dl)
5257 .setChain(DAG.getEntryNode())
5258 .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
5259
5260 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5261 return CallResult.first;
5262}
5263
5264static MVT getSVEContainerType(EVT ContentTy);
5265
5266SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
5267 SelectionDAG &DAG) const {
5268 EVT OpVT = Op.getValueType();
5269 EVT ArgVT = Op.getOperand(0).getValueType();
5270
5272 return LowerFixedLengthBitcastToSVE(Op, DAG);
5273
5274 if (OpVT.isScalableVector()) {
5275 assert(isTypeLegal(OpVT) && "Unexpected result type!");
5276
5277 // Handle type legalisation first.
5278 if (!isTypeLegal(ArgVT)) {
5279 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
5280 "Expected int->fp bitcast!");
5281
5282 // Bitcasting between unpacked vector types of different element counts is
5283 // not a NOP because the live elements are laid out differently.
5284 // 01234567
5285 // e.g. nxv2i32 = XX??XX??
5286 // nxv4f16 = X?X?X?X?
5287 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
5288 return SDValue();
5289
5290 SDValue ExtResult =
5291 DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
5292 Op.getOperand(0));
5293 return getSVESafeBitCast(OpVT, ExtResult, DAG);
5294 }
5295
5296 // Bitcasts between legal types with the same element count are legal.
5297 if (OpVT.getVectorElementCount() == ArgVT.getVectorElementCount())
5298 return Op;
5299
5300 // getSVESafeBitCast does not support casting between unpacked types.
5301 if (!isPackedVectorType(OpVT, DAG))
5302 return SDValue();
5303
5304 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
5305 }
5306
5307 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
5308 return SDValue();
5309
5310 // Bitcasts between f16 and bf16 are legal.
5311 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
5312 return Op;
5313
5314 assert(ArgVT == MVT::i16);
5315 SDLoc DL(Op);
5316
5317 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
5318 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
5319 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
5320}
5321
5322// Returns lane if Op extracts from a two-element vector and lane is constant
5323// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
5324static std::optional<uint64_t>
5325getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
5326 SDNode *OpNode = Op.getNode();
5327 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
5328 return std::nullopt;
5329
5330 EVT VT = OpNode->getOperand(0).getValueType();
5331 ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
5332 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
5333 return std::nullopt;
5334
5335 return C->getZExtValue();
5336}
5337
5338static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
5339 bool isSigned) {
5340 EVT VT = N.getValueType();
5341
5342 if (N.getOpcode() != ISD::BUILD_VECTOR)
5343 return false;
5344
5345 for (const SDValue &Elt : N->op_values()) {
5346 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
5347 unsigned EltSize = VT.getScalarSizeInBits();
5348 unsigned HalfSize = EltSize / 2;
5349 if (isSigned) {
5350 if (!isIntN(HalfSize, C->getSExtValue()))
5351 return false;
5352 } else {
5353 if (!isUIntN(HalfSize, C->getZExtValue()))
5354 return false;
5355 }
5356 continue;
5357 }
5358 return false;
5359 }
5360
5361 return true;
5362}
5363
5364static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
5365 EVT VT = N.getValueType();
5366 assert(VT.is128BitVector() && "Unexpected vector MULL size");
5367 EVT HalfVT = EVT::getVectorVT(
5368 *DAG.getContext(),
5369 VT.getVectorElementType().getHalfSizedIntegerVT(*DAG.getContext()),
5370 VT.getVectorElementCount());
5371 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), HalfVT, N);
5372}
5373
5374static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
5375 return N.getOpcode() == ISD::SIGN_EXTEND ||
5376 N.getOpcode() == ISD::ANY_EXTEND ||
5377 isExtendedBUILD_VECTOR(N, DAG, true);
5378}
5379
5380static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {
5381 return N.getOpcode() == ISD::ZERO_EXTEND ||
5382 N.getOpcode() == ISD::ANY_EXTEND ||
5383 isExtendedBUILD_VECTOR(N, DAG, false);
5384}
5385
5386static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
5387 unsigned Opcode = N.getOpcode();
5388 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5389 SDValue N0 = N.getOperand(0);
5390 SDValue N1 = N.getOperand(1);
5391 return N0->hasOneUse() && N1->hasOneUse() &&
5392 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
5393 }
5394 return false;
5395}
5396
5397static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
5398 unsigned Opcode = N.getOpcode();
5399 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5400 SDValue N0 = N.getOperand(0);
5401 SDValue N1 = N.getOperand(1);
5402 return N0->hasOneUse() && N1->hasOneUse() &&
5403 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
5404 }
5405 return false;
5406}
5407
5408SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
5409 SelectionDAG &DAG) const {
5410 // The rounding mode is in bits 23:22 of the FPCR.
5411 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
5412 // The formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3,
5413 // so that the shift + and get folded into a bitfield extract.
5414 SDLoc dl(Op);
5415
5416 SDValue Chain = Op.getOperand(0);
5417 SDValue FPCR_64 = DAG.getNode(
5418 ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
5419 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
5420 Chain = FPCR_64.getValue(1);
5421 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
5422 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
5423 DAG.getConstant(1U << 22, dl, MVT::i32));
5424 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
5425 DAG.getConstant(22, dl, MVT::i32));
5426 SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
5427 DAG.getConstant(3, dl, MVT::i32));
5428 return DAG.getMergeValues({AND, Chain}, dl);
5429}
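// Mapping check for the formula above (FPCR.RMode -> FLT_ROUNDS):
// RN (0b00) -> 1, RP (0b01) -> 2, RM (0b10) -> 3, RZ (0b11) -> 0, i.e.
// ((FPCR + (1 << 22)) >> 22) & 3 adds one modulo four to the two-bit
// rounding-mode field.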
5430
5431SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
5432 SelectionDAG &DAG) const {
5433 SDLoc DL(Op);
5434 SDValue Chain = Op->getOperand(0);
5435 SDValue RMValue = Op->getOperand(1);
5436
5437 // The rounding mode is in bits 23:22 of the FPCR.
5438 // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
5439 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
5440 // ((arg - 1) & 3) << 22).
5441 //
5442 // The argument of llvm.set.rounding must be within the segment [0, 3], so
5443 // NearestTiesToAway (4) is not handled here. It is responsibility of the code
5444 // generated llvm.set.rounding to ensure this condition.
5445
5446 // Calculate new value of FPCR[23:22].
5447 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
5448 DAG.getConstant(1, DL, MVT::i32));
5449 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
5450 DAG.getConstant(0x3, DL, MVT::i32));
5451 RMValue =
5452 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
5453 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
5454 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
5455
5456 // Get current value of FPCR.
5457 SDValue Ops[] = {
5458 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5459 SDValue FPCR =
5460 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5461 Chain = FPCR.getValue(1);
5462 FPCR = FPCR.getValue(0);
5463
5464 // Put the new rounding mode into FPCR[23:22].
5465 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
5466 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
5467 DAG.getConstant(RMMask, DL, MVT::i64));
5468 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
5469 SDValue Ops2[] = {
5470 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5471 FPCR};
5472 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5473}
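// Mapping check for the formula above (FLT_ROUNDS argument -> FPCR.RMode):
// 1 (nearest) -> 0b00, 2 (+inf) -> 0b01, 3 (-inf) -> 0b10,
// 0 (toward zero) -> 0b11, i.e. ((arg - 1) & 3) is the inverse of the
// GET_ROUNDING mapping above.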
5474
5475SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
5476 SelectionDAG &DAG) const {
5477 SDLoc DL(Op);
5478 SDValue Chain = Op->getOperand(0);
5479
5480 // Get current value of FPCR.
5481 SDValue Ops[] = {
5482 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5483 SDValue FPCR =
5484 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5485 Chain = FPCR.getValue(1);
5486 FPCR = FPCR.getValue(0);
5487
5488 // Truncate FPCR to 32 bits.
5489 SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR);
5490
5491 return DAG.getMergeValues({Result, Chain}, DL);
5492}
5493
5494SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
5495 SelectionDAG &DAG) const {
5496 SDLoc DL(Op);
5497 SDValue Chain = Op->getOperand(0);
5498 SDValue Mode = Op->getOperand(1);
5499
5500 // Extend the specified value to 64 bits.
5501 SDValue FPCR = DAG.getZExtOrTrunc(Mode, DL, MVT::i64);
5502
5503 // Set new value of FPCR.
5504 SDValue Ops2[] = {
5505 Chain, DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), FPCR};
5506 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5507}
5508
5509SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
5510 SelectionDAG &DAG) const {
5511 SDLoc DL(Op);
5512 SDValue Chain = Op->getOperand(0);
5513
5514 // Get current value of FPCR.
5515 SDValue Ops[] = {
5516 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5517 SDValue FPCR =
5518 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5519 Chain = FPCR.getValue(1);
5520 FPCR = FPCR.getValue(0);
5521
5522 // Clear bits that are not reserved.
5523 SDValue FPSCRMasked = DAG.getNode(
5524 ISD::AND, DL, MVT::i64, FPCR,
5526
5527 // Set new value of FPCR.
5528 SDValue Ops2[] = {Chain,
5529 DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5530 FPSCRMasked};
5531 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5532}
5533
5534static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
5535 SDLoc DL, bool &IsMLA) {
5536 bool IsN0SExt = isSignExtended(N0, DAG);
5537 bool IsN1SExt = isSignExtended(N1, DAG);
5538 if (IsN0SExt && IsN1SExt)
5539 return AArch64ISD::SMULL;
5540
5541 bool IsN0ZExt = isZeroExtended(N0, DAG);
5542 bool IsN1ZExt = isZeroExtended(N1, DAG);
5543
5544 if (IsN0ZExt && IsN1ZExt)
5545 return AArch64ISD::UMULL;
5546
5547 // Select UMULL if we can replace the other operand with an extend.
5548 EVT VT = N0.getValueType();
5549 unsigned EltSize = VT.getScalarSizeInBits();
5550 APInt Mask = APInt::getHighBitsSet(EltSize, EltSize / 2);
5551 if (IsN0ZExt || IsN1ZExt) {
5552 if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
5553 return AArch64ISD::UMULL;
5554 } else if (VT == MVT::v2i64 && DAG.MaskedValueIsZero(N0, Mask) &&
5555 DAG.MaskedValueIsZero(N1, Mask)) {
5556 // For v2i64 we look more aggressively at both operands being zero, to avoid
5557 // scalarization.
5558 return AArch64ISD::UMULL;
5559 }
5560
5561 if (IsN0SExt || IsN1SExt) {
5562 if (DAG.ComputeNumSignBits(IsN0SExt ? N1 : N0) > EltSize / 2)
5563 return AArch64ISD::SMULL;
5564 } else if (VT == MVT::v2i64 && DAG.ComputeNumSignBits(N0) > EltSize / 2 &&
5565 DAG.ComputeNumSignBits(N1) > EltSize / 2) {
5566 return AArch64ISD::SMULL;
5567 }
5568
5569 if (!IsN1SExt && !IsN1ZExt)
5570 return 0;
5571
5572 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
5573 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
5574 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
5575 IsMLA = true;
5576 return AArch64ISD::SMULL;
5577 }
5578 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
5579 IsMLA = true;
5580 return AArch64ISD::UMULL;
5581 }
5582 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
5583 std::swap(N0, N1);
5584 IsMLA = true;
5585 return AArch64ISD::UMULL;
5586 }
5587 return 0;
5588}
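// Illustrative example for the selection above: for
// mul (zext <8 x i8> %a to <8 x i16>), (zext <8 x i8> %b to <8 x i16>)
// both operands are zero-extended, so this returns AArch64ISD::UMULL and the
// caller can emit a single umull v0.8h, v1.8b, v2.8b instead of widening both
// inputs and multiplying in the wider type.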
5589
5590SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
5591 EVT VT = Op.getValueType();
5592
5593 bool OverrideNEON = !Subtarget->isNeonAvailable();
5594 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
5595 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5596
5597 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
5598 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
5599 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
5600 "unexpected type for custom-lowering ISD::MUL");
5601 SDValue N0 = Op.getOperand(0);
5602 SDValue N1 = Op.getOperand(1);
5603 bool isMLA = false;
5604 EVT OVT = VT;
5605 if (VT.is64BitVector()) {
5606 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5607 isNullConstant(N0.getOperand(1)) &&
5608 N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5609 isNullConstant(N1.getOperand(1))) {
5610 N0 = N0.getOperand(0);
5611 N1 = N1.getOperand(0);
5612 VT = N0.getValueType();
5613 } else {
5614 if (VT == MVT::v1i64) {
5615 if (Subtarget->hasSVE())
5616 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5617 // Fall through to expand this. It is not legal.
5618 return SDValue();
5619 } else
5620 // Other vector multiplications are legal.
5621 return Op;
5622 }
5623 }
5624
5625 SDLoc DL(Op);
5626 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
5627
5628 if (!NewOpc) {
5629 if (VT.getVectorElementType() == MVT::i64) {
5630 // If SVE is available then i64 vector multiplications can also be made
5631 // legal.
5632 if (Subtarget->hasSVE())
5633 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5634 // Fall through to expand this. It is not legal.
5635 return SDValue();
5636 } else
5637 // Other vector multiplications are legal.
5638 return Op;
5639 }
5640
5641 // Legalize to a S/UMULL instruction
5642 SDValue Op0;
5643 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
5644 if (!isMLA) {
5645 Op0 = skipExtensionForVectorMULL(N0, DAG);
5646 assert(Op0.getValueType().is64BitVector() &&
5647 Op1.getValueType().is64BitVector() &&
5648 "unexpected types for extended operands to VMULL");
5649 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
5650 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
5651 DAG.getConstant(0, DL, MVT::i64));
5652 }
5653 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
5654 // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
5655 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57.
5656 SDValue N00 = skipExtensionForVectorMULL(N0.getOperand(0), DAG);
5657 SDValue N01 = skipExtensionForVectorMULL(N0.getOperand(1), DAG);
5658 EVT Op1VT = Op1.getValueType();
5659 return DAG.getNode(
5660 ISD::EXTRACT_SUBVECTOR, DL, OVT,
5661 DAG.getNode(N0.getOpcode(), DL, VT,
5662 DAG.getNode(NewOpc, DL, VT,
5663 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5664 DAG.getNode(NewOpc, DL, VT,
5665 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5666 DAG.getConstant(0, DL, MVT::i64));
5667}
5668
5669static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5670 int Pattern) {
5671 if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
5672 return DAG.getConstant(1, DL, MVT::nxv1i1);
5673 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
5674 DAG.getTargetConstant(Pattern, DL, MVT::i32));
5675}
5676
5677static SDValue optimizeIncrementingWhile(SDValue Op, SelectionDAG &DAG,
5678 bool IsSigned, bool IsEqual) {
5679 if (!isa<ConstantSDNode>(Op.getOperand(1)) ||
5680 !isa<ConstantSDNode>(Op.getOperand(2)))
5681 return SDValue();
5682
5683 SDLoc dl(Op);
5684 APInt X = Op.getConstantOperandAPInt(1);
5685 APInt Y = Op.getConstantOperandAPInt(2);
5686
5687 // When the second operand is the maximum value, comparisons that include
5688 // equality can never fail and thus we can return an all active predicate.
5689 if (IsEqual)
5690 if (IsSigned ? Y.isMaxSignedValue() : Y.isMaxValue())
5691 return DAG.getConstant(1, dl, Op.getValueType());
5692
5693 bool Overflow;
5694 APInt NumActiveElems =
5695 IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
5696
5697 if (Overflow)
5698 return SDValue();
5699
5700 if (IsEqual) {
5701 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
5702 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
5703 : NumActiveElems.uadd_ov(One, Overflow);
5704 if (Overflow)
5705 return SDValue();
5706 }
5707
5708 std::optional<unsigned> PredPattern =
5709 getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue());
5710 unsigned MinSVEVectorSize = std::max(
5711 DAG.getSubtarget<AArch64Subtarget>().getMinSVEVectorSizeInBits(), 128u);
5712 unsigned ElementSize = 128 / Op.getValueType().getVectorMinNumElements();
5713 if (PredPattern != std::nullopt &&
5714 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
5715 return getPTrue(DAG, dl, Op.getValueType(), *PredPattern);
5716
5717 return SDValue();
5718}
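// Worked example for the constant folding above (illustrative operands):
// a whilelo-style intrinsic producing <vscale x 4 x i1> with X = 0 and Y = 4
// has NumActiveElems = 4 and ElementSize = 128 / 4 = 32, so with the minimum
// SVE register size of 128 bits we have 4 <= 128 / 32 and the whole operation
// folds to ptrue p0.s, vl4.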
5719
5720// Returns a safe bitcast between two scalable vector predicates, where
5721// any newly created lanes from a widening bitcast are defined as zero.
5722static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
5723 SDLoc DL(Op);
5724 EVT InVT = Op.getValueType();
5725
5726 assert(InVT.getVectorElementType() == MVT::i1 &&
5727 VT.getVectorElementType() == MVT::i1 &&
5728 "Expected a predicate-to-predicate bitcast");
5729 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5730 InVT.isScalableVector() &&
5731 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
5732 "Only expect to cast between legal scalable predicate types!");
5733
5734 // Return the operand if the cast isn't changing type,
5735 if (InVT == VT)
5736 return Op;
5737
5738 // Look through casts to <vscale x 16 x i1> when their input has more lanes
5739 // than VT. This will increase the chances of removing casts that introduce
5740 // new lanes, which have to be explicitly zero'd.
5741 if (Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
5742 Op.getConstantOperandVal(0) == Intrinsic::aarch64_sve_convert_to_svbool &&
5743 Op.getOperand(1).getValueType().bitsGT(VT))
5744 Op = Op.getOperand(1);
5745
5746 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
5747
5748 // We only have to zero the lanes if new lanes are being defined, e.g. when
5749 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
5750 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
5751 // we can return here.
5752 if (InVT.bitsGT(VT))
5753 return Reinterpret;
5754
5755 // Check if the other lanes are already known to be zeroed by
5756 // construction.
5757 if (isZeroingInactiveLanes(Op))
5758 return Reinterpret;
5759
5760 // Zero the newly introduced lanes.
5761 SDValue Mask = DAG.getConstant(1, DL, InVT);
5762 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
5763 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
5764}
5765
5766SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
5767 SDValue Chain, SDLoc DL,
5768 EVT VT) const {
5769 SDValue Callee = DAG.getExternalSymbol("__arm_sme_state",
5770 getPointerTy(DAG.getDataLayout()));
5771 Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
5772 Type *RetTy = StructType::get(Int64Ty, Int64Ty);
5773 TargetLowering::ArgListTy Args;
5774 TargetLowering::CallLoweringInfo CLI(DAG);
5775 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
5776 CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2,
5777 RetTy, Callee, std::move(Args));
5778 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5779 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
5780 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
5781 Mask);
5782}
5783
5784// Lower an SME LDR/STR ZA intrinsic
5785// Case 1: If the vector number (vecnum) is an immediate in range, it gets
5786// folded into the instruction
5787// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
5788// Case 2: If the vecnum is not an immediate, then it is used to modify the base
5789// and tile slice registers
5790// ldr(%tileslice, %ptr, %vecnum)
5791// ->
5792// %svl = rdsvl
5793// %ptr2 = %ptr + %svl * %vecnum
5794// %tileslice2 = %tileslice + %vecnum
5795// ldr [%tileslice2, 0], [%ptr2, 0]
5796// Case 3: If the vecnum is an immediate out of range, then the same is done as
5797// case 2, but the base and slice registers are modified by the greatest
5798// multiple of 15 lower than the vecnum and the remainder is folded into the
5799// instruction. This means that successive loads and stores that are offset from
5800// each other can share the same base and slice register updates.
5801// ldr(%tileslice, %ptr, 22)
5802// ldr(%tileslice, %ptr, 23)
5803// ->
5804// %svl = rdsvl
5805// %ptr2 = %ptr + %svl * 15
5806// %tileslice2 = %tileslice + 15
5807// ldr [%tileslice2, 7], [%ptr2, 7]
5808// ldr [%tileslice2, 8], [%ptr2, 8]
5809// Case 4: If the vecnum is an add of an immediate, then the non-immediate
5810// operand and the immediate can be folded into the instruction, like case 2.
5811// ldr(%tileslice, %ptr, %vecnum + 7)
5812// ldr(%tileslice, %ptr, %vecnum + 8)
5813// ->
5814// %svl = rdsvl
5815// %ptr2 = %ptr + %svl * %vecnum
5816// %tileslice2 = %tileslice + %vecnum
5817// ldr [%tileslice2, 7], [%ptr2, 7]
5818// ldr [%tileslice2, 8], [%ptr2, 8]
5819// Case 5: The vecnum being an add of an immediate out of range is also handled,
5820// in which case the same remainder logic as case 3 is used.
5821static SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
5822 SDLoc DL(N);
5823
5824 SDValue TileSlice = N->getOperand(2);
5825 SDValue Base = N->getOperand(3);
5826 SDValue VecNum = N->getOperand(4);
5827 int32_t ConstAddend = 0;
5828 SDValue VarAddend = VecNum;
5829
5830 // If the vnum is an add of an immediate, we can fold it into the instruction
5831 if (VecNum.getOpcode() == ISD::ADD &&
5832 isa<ConstantSDNode>(VecNum.getOperand(1))) {
5833 ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
5834 VarAddend = VecNum.getOperand(0);
5835 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
5836 ConstAddend = ImmNode->getSExtValue();
5837 VarAddend = SDValue();
5838 }
5839
5840 int32_t ImmAddend = ConstAddend % 16;
5841 if (int32_t C = (ConstAddend - ImmAddend)) {
5842 SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
5843 VarAddend = VarAddend
5844 ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
5845 : CVal;
5846 }
5847
5848 if (VarAddend) {
5849 // Get the vector length that will be multiplied by vnum
5850 auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
5851 DAG.getConstant(1, DL, MVT::i32));
5852
5853 // Multiply SVL and vnum then add it to the base
5854 SDValue Mul = DAG.getNode(
5855 ISD::MUL, DL, MVT::i64,
5856 {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
5857 Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
5858 // Just add vnum to the tileslice
5859 TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
5860 }
5861
5862 return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
5863 DL, MVT::Other,
5864 {/*Chain=*/N.getOperand(0), TileSlice, Base,
5865 DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
5866}
5867
5868static SDValue LowerVectorMatch(SDValue Op, SelectionDAG &DAG) {
5869 SDLoc dl(Op);
5870 SDValue ID =
5871 DAG.getTargetConstant(Intrinsic::aarch64_sve_match, dl, MVT::i64);
5872
5873 auto Op1 = Op.getOperand(1);
5874 auto Op2 = Op.getOperand(2);
5875 auto Mask = Op.getOperand(3);
5876
5877 EVT Op1VT = Op1.getValueType();
5878 EVT Op2VT = Op2.getValueType();
5879 EVT ResVT = Op.getValueType();
5880
5881 assert((Op1VT.getVectorElementType() == MVT::i8 ||
5882 Op1VT.getVectorElementType() == MVT::i16) &&
5883 "Expected 8-bit or 16-bit characters.");
5884
5885 // Scalable vector type used to wrap operands.
5886 // A single container is enough for both operands because ultimately the
5887 // operands will have to be wrapped to the same type (nxv16i8 or nxv8i16).
5888 EVT OpContainerVT = Op1VT.isScalableVector()
5889 ? Op1VT
5890 : getContainerForFixedLengthVector(DAG, Op1VT);
5891
5892 if (Op2VT.is128BitVector()) {
5893 // If Op2 is a full 128-bit vector, wrap it trivially in a scalable vector.
5894 Op2 = convertToScalableVector(DAG, OpContainerVT, Op2);
5895 // Further, if the result is scalable, broadcast Op2 to a full SVE register.
5896 if (ResVT.isScalableVector())
5897 Op2 = DAG.getNode(AArch64ISD::DUPLANE128, dl, OpContainerVT, Op2,
5898 DAG.getTargetConstant(0, dl, MVT::i64));
5899 } else {
5900 // If Op2 is not a full 128-bit vector, we always need to broadcast it.
5901 unsigned Op2BitWidth = Op2VT.getFixedSizeInBits();
5902 MVT Op2IntVT = MVT::getIntegerVT(Op2BitWidth);
5903 EVT Op2PromotedVT = getPackedSVEVectorVT(Op2IntVT);
5904 Op2 = DAG.getBitcast(MVT::getVectorVT(Op2IntVT, 1), Op2);
5905 Op2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op2IntVT, Op2,
5906 DAG.getConstant(0, dl, MVT::i64));
5907 Op2 = DAG.getSplatVector(Op2PromotedVT, dl, Op2);
5908 Op2 = DAG.getBitcast(OpContainerVT, Op2);
5909 }
5910
5911 // If the result is scalable, we just need to carry out the MATCH.
5912 if (ResVT.isScalableVector())
5913 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, ResVT, ID, Mask, Op1, Op2);
5914
5915 // If the result is fixed, we can still use MATCH but we need to wrap the
5916 // first operand and the mask in scalable vectors before doing so.
5917
5918 // Wrap the operands.
5919 Op1 = convertToScalableVector(DAG, OpContainerVT, Op1);
5920 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, Op1VT, Mask);
5921 Mask = convertFixedMaskToScalableVector(Mask, DAG);
5922
5923 // Carry out the match.
5924 SDValue Match = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Mask.getValueType(),
5925 ID, Mask, Op1, Op2);
5926
5927 // Extract and promote the match result (nxv16i1/nxv8i1) to ResVT
5928 // (v16i8/v8i8).
5929 Match = DAG.getNode(ISD::SIGN_EXTEND, dl, OpContainerVT, Match);
5930 Match = convertFromScalableVector(DAG, Op1VT, Match);
5931 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, Match);
5932}
5933
5934SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
5935 SelectionDAG &DAG) const {
5936 unsigned IntNo = Op.getConstantOperandVal(1);
5937 SDLoc DL(Op);
5938 switch (IntNo) {
5939 default:
5940 return SDValue(); // Don't custom lower most intrinsics.
5941 case Intrinsic::aarch64_prefetch: {
5942 SDValue Chain = Op.getOperand(0);
5943 SDValue Addr = Op.getOperand(2);
5944
5945 unsigned IsWrite = Op.getConstantOperandVal(3);
5946 unsigned Locality = Op.getConstantOperandVal(4);
5947 unsigned IsStream = Op.getConstantOperandVal(5);
5948 unsigned IsData = Op.getConstantOperandVal(6);
5949 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
5950 (!IsData << 3) | // IsDataCache bit
5951 (Locality << 1) | // Cache level bits
5952 (unsigned)IsStream; // Stream bit
5953
5954 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
5955 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
5956 }
5957 case Intrinsic::aarch64_sme_str:
5958 case Intrinsic::aarch64_sme_ldr: {
5959 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
5960 }
5961 case Intrinsic::aarch64_sme_za_enable:
5962 return DAG.getNode(
5963 AArch64ISD::SMSTART, DL, MVT::Other,
5964 Op->getOperand(0), // Chain
5965 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5966 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5967 case Intrinsic::aarch64_sme_za_disable:
5968 return DAG.getNode(
5969 AArch64ISD::SMSTOP, DL, MVT::Other,
5970 Op->getOperand(0), // Chain
5971 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5972 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5973 }
5974}
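// Illustrative PrfOp encodings for the aarch64_prefetch case above: write=0,
// cache level=0 (L1), stream=0, data=1 gives 0b00000, i.e. PLDL1KEEP, while
// write=1, cache level=1 (L2), stream=1, data=1 gives 0b10011, i.e. PSTL2STRM.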
5975
5976SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5977 SelectionDAG &DAG) const {
5978 unsigned IntNo = Op.getConstantOperandVal(1);
5979 SDLoc DL(Op);
5980 switch (IntNo) {
5981 default:
5982 return SDValue(); // Don't custom lower most intrinsics.
5983 case Intrinsic::aarch64_mops_memset_tag: {
5984 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
5985 SDValue Chain = Node->getChain();
5986 SDValue Dst = Op.getOperand(2);
5987 SDValue Val = Op.getOperand(3);
5988 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
5989 SDValue Size = Op.getOperand(4);
5990 auto Alignment = Node->getMemOperand()->getAlign();
5991 bool IsVol = Node->isVolatile();
5992 auto DstPtrInfo = Node->getPointerInfo();
5993
5994 const auto &SDI =
5995 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
5996 SDValue MS = SDI.EmitMOPS(AArch64::MOPSMemorySetTaggingPseudo, DAG, DL,
5997 Chain, Dst, Val, Size, Alignment, IsVol,
5998 DstPtrInfo, MachinePointerInfo{});
5999
6000 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
6001 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
6002 // LowerOperationWrapper will complain that the number of results has
6003 // changed.
6004 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
6005 }
6006 }
6007}
6008
6009SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
6010 SelectionDAG &DAG) const {
6011 unsigned IntNo = Op.getConstantOperandVal(0);
6012 SDLoc dl(Op);
6013 switch (IntNo) {
6014 default: return SDValue(); // Don't custom lower most intrinsics.
6015 case Intrinsic::thread_pointer: {
6016 EVT PtrVT = getPointerTy(DAG.getDataLayout());
6017 return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
6018 }
6019 case Intrinsic::aarch64_neon_abs: {
6020 EVT Ty = Op.getValueType();
6021 if (Ty == MVT::i64) {
6022 SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
6023 Op.getOperand(1));
6024 Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
6025 return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
6026 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
6027 return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
6028 } else {
6029 report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
6030 }
6031 }
6032 case Intrinsic::aarch64_neon_pmull64: {
6033 SDValue LHS = Op.getOperand(1);
6034 SDValue RHS = Op.getOperand(2);
6035
6036 std::optional<uint64_t> LHSLane =
6037 getConstantLaneNumOfExtractHalfOperand(LHS);
6038 std::optional<uint64_t> RHSLane =
6039 getConstantLaneNumOfExtractHalfOperand(RHS);
6040
6041 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
6042 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
6043
6044 // 'aarch64_neon_pmull64' takes i64 parameters; while pmull/pmull2
6045 // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
6046 // which ISel recognizes better. For example, generate a ldr into d*
6047 // registers as opposed to a GPR load followed by a fmov.
6048 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
6049 std::optional<uint64_t> OtherLane,
6050 const SDLoc &dl,
6051 SelectionDAG &DAG) -> SDValue {
6052 // If the operand is an higher half itself, rewrite it to
6053 // extract_high_v2i64; this way aarch64_neon_pmull64 could
6054 // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
6055 if (NLane && *NLane == 1)
6056 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
6057 N.getOperand(0), DAG.getConstant(1, dl, MVT::i64));
6058
6059 // Operand N is not a higher half but the other operand is.
6060 if (OtherLane && *OtherLane == 1) {
6061 // If this operand is a lower half, rewrite it to
6062 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
6063 // align lanes of two operands. A roundtrip sequence (to move from lane
6064 // 1 to lane 0) is like this:
6065 // mov x8, v0.d[1]
6066 // fmov d0, x8
6067 if (NLane && *NLane == 0)
6068 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
6069 DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64,
6070 N.getOperand(0),
6071 DAG.getConstant(0, dl, MVT::i64)),
6072 DAG.getConstant(1, dl, MVT::i64));
6073
6074 // Otherwise just dup from main to all lanes.
6075 return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N);
6076 }
6077
6078      // Neither operand is an extract of the higher half, so codegen may just
6079      // use the non-high version of the PMULL instruction. Use v1i64 to represent i64.
6080 assert(N.getValueType() == MVT::i64 &&
6081 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
6082 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N);
6083 };
6084
6085 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG);
6086 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG);
6087
6088 return DAG.getNode(AArch64ISD::PMULL, dl, Op.getValueType(), LHS, RHS);
6089 }
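  // Hedged example of the canonicalization above: if both operands of
  //   %r = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %a, i64 %b)
  // are extractelement of lane 1 of <2 x i64> vectors, they are rewritten as
  // extract_high_v2i64 so ISel can select a single
  //   pmull2 v0.1q, v1.2d, v2.2d
  // rather than moving each lane through a GPR with mov/fmov first.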
6090 case Intrinsic::aarch64_neon_smax:
6091 return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
6092 Op.getOperand(1), Op.getOperand(2));
6093 case Intrinsic::aarch64_neon_umax:
6094 return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
6095 Op.getOperand(1), Op.getOperand(2));
6096 case Intrinsic::aarch64_neon_smin:
6097 return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
6098 Op.getOperand(1), Op.getOperand(2));
6099 case Intrinsic::aarch64_neon_umin:
6100 return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
6101 Op.getOperand(1), Op.getOperand(2));
6102 case Intrinsic::aarch64_neon_scalar_sqxtn:
6103 case Intrinsic::aarch64_neon_scalar_sqxtun:
6104 case Intrinsic::aarch64_neon_scalar_uqxtn: {
6105 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
6106 if (Op.getValueType() == MVT::i32)
6107 return DAG.getNode(ISD::BITCAST, dl, MVT::i32,
6108 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::f32,
6109 Op.getOperand(0),
6110 DAG.getNode(ISD::BITCAST, dl, MVT::f64,
6111 Op.getOperand(1))));
6112 return SDValue();
6113 }
6114 case Intrinsic::aarch64_neon_sqxtn:
6115 return DAG.getNode(ISD::TRUNCATE_SSAT_S, dl, Op.getValueType(),
6116 Op.getOperand(1));
6117 case Intrinsic::aarch64_neon_sqxtun:
6118 return DAG.getNode(ISD::TRUNCATE_SSAT_U, dl, Op.getValueType(),
6119 Op.getOperand(1));
6120 case Intrinsic::aarch64_neon_uqxtn:
6121 return DAG.getNode(ISD::TRUNCATE_USAT_U, dl, Op.getValueType(),
6122 Op.getOperand(1));
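  // For illustration (assumed mapping, matching the cases above): expressing
  // the saturating-narrow intrinsics with the generic TRUNCATE_*SAT_* nodes
  // lets, e.g.,
  //   %r = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> %v)
  // be selected as "sqxtn v0.8b, v0.8h" while still participating in the
  // generic saturation combines.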
6123 case Intrinsic::aarch64_neon_sqshrn:
6124 if (Op.getValueType().isVector())
6125 return DAG.getNode(ISD::TRUNCATE_SSAT_S, dl, Op.getValueType(),
6126 DAG.getNode(AArch64ISD::VASHR, dl,
6127 Op.getOperand(1).getValueType(),
6128 Op.getOperand(1), Op.getOperand(2)));
6129 return SDValue();
6130 case Intrinsic::aarch64_neon_sqshrun:
6131 if (Op.getValueType().isVector())
6132 return DAG.getNode(ISD::TRUNCATE_SSAT_U, dl, Op.getValueType(),
6133 DAG.getNode(AArch64ISD::VASHR, dl,
6134 Op.getOperand(1).getValueType(),
6135 Op.getOperand(1), Op.getOperand(2)));
6136 return SDValue();
6137 case Intrinsic::aarch64_neon_uqshrn:
6138 if (Op.getValueType().isVector())
6139 return DAG.getNode(ISD::TRUNCATE_USAT_U, dl, Op.getValueType(),
6140 DAG.getNode(AArch64ISD::VLSHR, dl,
6141 Op.getOperand(1).getValueType(),
6142 Op.getOperand(1), Op.getOperand(2)));
6143 return SDValue();
6144 case Intrinsic::aarch64_neon_sqrshrn:
6145 if (Op.getValueType().isVector())
6146 return DAG.getNode(
6147 ISD::TRUNCATE_SSAT_S, dl, Op.getValueType(),
6148 DAG.getNode(
6149 AArch64ISD::SRSHR_I, dl, Op.getOperand(1).getValueType(),
6150 Op.getOperand(1), Op.getOperand(2)));
6151 return SDValue();
6152 case Intrinsic::aarch64_neon_sqrshrun:
6153 if (Op.getValueType().isVector())
6154 return DAG.getNode(
6155 ISD::TRUNCATE_SSAT_U, dl, Op.getValueType(),
6156 DAG.getNode(
6157 AArch64ISD::SRSHR_I, dl, Op.getOperand(1).getValueType(),
6158 Op.getOperand(1), Op.getOperand(2)));
6159 return SDValue();
6160 case Intrinsic::aarch64_neon_uqrshrn:
6161 if (Op.getValueType().isVector())
6162 return DAG.getNode(
6163 ISD::TRUNCATE_USAT_U, dl, Op.getValueType(),
6164 DAG.getNode(
6165 AArch64ISD::URSHR_I, dl, Op.getOperand(1).getValueType(), Op.getOperand(1), Op.getOperand(2)));
6166 return SDValue();
6167 case Intrinsic::aarch64_sve_whilelo:
6168 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
6169 /*IsEqual=*/false);
6170 case Intrinsic::aarch64_sve_whilelt:
6171 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
6172 /*IsEqual=*/false);
6173 case Intrinsic::aarch64_sve_whilels:
6174 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
6175 /*IsEqual=*/true);
6176 case Intrinsic::aarch64_sve_whilele:
6177 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
6178 /*IsEqual=*/true);
6179 case Intrinsic::aarch64_sve_sunpkhi:
6180 return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
6181 Op.getOperand(1));
6182 case Intrinsic::aarch64_sve_sunpklo:
6183 return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
6184 Op.getOperand(1));
6185 case Intrinsic::aarch64_sve_uunpkhi:
6186 return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
6187 Op.getOperand(1));
6188 case Intrinsic::aarch64_sve_uunpklo:
6189 return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
6190 Op.getOperand(1));
6191 case Intrinsic::aarch64_sve_clasta_n:
6192 return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
6193 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6194 case Intrinsic::aarch64_sve_clastb_n:
6195 return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
6196 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6197 case Intrinsic::aarch64_sve_lasta:
6198 return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
6199 Op.getOperand(1), Op.getOperand(2));
6200 case Intrinsic::aarch64_sve_lastb:
6201 return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
6202 Op.getOperand(1), Op.getOperand(2));
6203 case Intrinsic::aarch64_sve_rev:
6204 return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
6205 Op.getOperand(1));
6206 case Intrinsic::aarch64_sve_tbl:
6207 return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
6208 Op.getOperand(1), Op.getOperand(2));
6209 case Intrinsic::aarch64_sve_trn1:
6210 return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
6211 Op.getOperand(1), Op.getOperand(2));
6212 case Intrinsic::aarch64_sve_trn2:
6213 return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
6214 Op.getOperand(1), Op.getOperand(2));
6215 case Intrinsic::aarch64_sve_uzp1:
6216 return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
6217 Op.getOperand(1), Op.getOperand(2));
6218 case Intrinsic::aarch64_sve_uzp2:
6219 return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
6220 Op.getOperand(1), Op.getOperand(2));
6221 case Intrinsic::aarch64_sve_zip1:
6222 return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
6223 Op.getOperand(1), Op.getOperand(2));
6224 case Intrinsic::aarch64_sve_zip2:
6225 return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
6226 Op.getOperand(1), Op.getOperand(2));
6227 case Intrinsic::aarch64_sve_splice:
6228 return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
6229 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6230 case Intrinsic::aarch64_sve_ptrue:
6231 return getPTrue(DAG, dl, Op.getValueType(), Op.getConstantOperandVal(1));
6232 case Intrinsic::aarch64_sve_clz:
6233 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
6234 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6235 case Intrinsic::aarch64_sme_cntsb:
6236 return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
6237 DAG.getConstant(1, dl, MVT::i32));
6238 case Intrinsic::aarch64_sme_cntsh: {
6239 SDValue One = DAG.getConstant(1, dl, MVT::i32);
6240 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One);
6241 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One);
6242 }
6243 case Intrinsic::aarch64_sme_cntsw: {
6244 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
6245 DAG.getConstant(1, dl, MVT::i32));
6246 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
6247 DAG.getConstant(2, dl, MVT::i32));
6248 }
6249 case Intrinsic::aarch64_sme_cntsd: {
6250 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
6251 DAG.getConstant(1, dl, MVT::i32));
6252 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
6253 DAG.getConstant(3, dl, MVT::i32));
6254 }
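  // A hedged example of the SME counter lowerings above: aarch64.sme.cntsd is
  // RDSVL #1 (the streaming vector length in bytes) shifted right by 3, i.e.
  // roughly
  //   rdsvl x0, #1
  //   lsr   x0, x0, #3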
6255 case Intrinsic::aarch64_sve_cnt: {
6256 SDValue Data = Op.getOperand(3);
6257 // CTPOP only supports integer operands.
6258 if (Data.getValueType().isFloatingPoint())
6259 Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
6260 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
6261 Op.getOperand(2), Data, Op.getOperand(1));
6262 }
6263 case Intrinsic::aarch64_sve_dupq_lane:
6264 return LowerDUPQLane(Op, DAG);
6265 case Intrinsic::aarch64_sve_convert_from_svbool:
6266 if (Op.getValueType() == MVT::aarch64svcount)
6267 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Op.getOperand(1));
6268 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
6269 case Intrinsic::aarch64_sve_convert_to_svbool:
6270 if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
6271 return DAG.getNode(ISD::BITCAST, dl, MVT::nxv16i1, Op.getOperand(1));
6272 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
6273 case Intrinsic::aarch64_sve_fneg:
6274 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
6275 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6276 case Intrinsic::aarch64_sve_frintp:
6277 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
6278 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6279 case Intrinsic::aarch64_sve_frintm:
6280 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
6281 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6282 case Intrinsic::aarch64_sve_frinti:
6283 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(),
6284 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6285 case Intrinsic::aarch64_sve_frintx:
6286 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
6287 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6288 case Intrinsic::aarch64_sve_frinta:
6289 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
6290 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6291 case Intrinsic::aarch64_sve_frintn:
6292 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(),
6293 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6294 case Intrinsic::aarch64_sve_frintz:
6295 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
6296 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6297  case Intrinsic::aarch64_sve_ucvtf:
6298    return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl,
6299                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6300                       Op.getOperand(1));
6301  case Intrinsic::aarch64_sve_scvtf:
6302    return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl,
6303                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6304                       Op.getOperand(1));
6305  case Intrinsic::aarch64_sve_fcvtzu:
6306    return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl,
6307                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6308                       Op.getOperand(1));
6309  case Intrinsic::aarch64_sve_fcvtzs:
6310    return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl,
6311                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6312                       Op.getOperand(1));
6313 case Intrinsic::aarch64_sve_fsqrt:
6314 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
6315 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6316 case Intrinsic::aarch64_sve_frecpx:
6317 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
6318 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6319 case Intrinsic::aarch64_sve_frecpe_x:
6320 return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(),
6321 Op.getOperand(1));
6322 case Intrinsic::aarch64_sve_frecps_x:
6323 return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(),
6324 Op.getOperand(1), Op.getOperand(2));
6325 case Intrinsic::aarch64_sve_frsqrte_x:
6326 return DAG.getNode(AArch64ISD::FRSQRTE, dl, Op.getValueType(),
6327 Op.getOperand(1));
6328 case Intrinsic::aarch64_sve_frsqrts_x:
6329 return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(),
6330 Op.getOperand(1), Op.getOperand(2));
6331 case Intrinsic::aarch64_sve_fabs:
6332 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
6333 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6334 case Intrinsic::aarch64_sve_abs:
6335 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
6336 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6337 case Intrinsic::aarch64_sve_neg:
6338 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
6339 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6340 case Intrinsic::aarch64_sve_insr: {
6341 SDValue Scalar = Op.getOperand(2);
6342 EVT ScalarTy = Scalar.getValueType();
6343 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
6344 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
6345
6346 return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
6347 Op.getOperand(1), Scalar);
6348 }
6349 case Intrinsic::aarch64_sve_rbit:
6350    return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl,
6351                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6352 Op.getOperand(1));
6353 case Intrinsic::aarch64_sve_revb:
6354 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
6355 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6356 case Intrinsic::aarch64_sve_revh:
6357 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(),
6358 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6359 case Intrinsic::aarch64_sve_revw:
6360 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(),
6361 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6362 case Intrinsic::aarch64_sve_revd:
6363 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(),
6364 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6365 case Intrinsic::aarch64_sve_sxtb:
6366 return DAG.getNode(
6367        AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
6368        Op.getOperand(2), Op.getOperand(3),
6369 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
6370 Op.getOperand(1));
6371 case Intrinsic::aarch64_sve_sxth:
6372 return DAG.getNode(
6373        AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
6374        Op.getOperand(2), Op.getOperand(3),
6375 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
6376 Op.getOperand(1));
6377 case Intrinsic::aarch64_sve_sxtw:
6378 return DAG.getNode(
6379        AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
6380        Op.getOperand(2), Op.getOperand(3),
6381 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
6382 Op.getOperand(1));
6383 case Intrinsic::aarch64_sve_uxtb:
6384 return DAG.getNode(
6385        AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
6386        Op.getOperand(2), Op.getOperand(3),
6387 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
6388 Op.getOperand(1));
6389 case Intrinsic::aarch64_sve_uxth:
6390 return DAG.getNode(
6391        AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
6392        Op.getOperand(2), Op.getOperand(3),
6393 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
6394 Op.getOperand(1));
6395 case Intrinsic::aarch64_sve_uxtw:
6396 return DAG.getNode(
6397        AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
6398        Op.getOperand(2), Op.getOperand(3),
6399 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
6400 Op.getOperand(1));
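  // A hedged illustration of the extend cases above: aarch64.sve.uxtb on
  // <vscale x 8 x i16> becomes a zero-extend-in-register merge node that
  // typically selects to something like
  //   uxtb z0.h, p0/m, z1.h
  // with the extra operand supplying the inactive (passthru) lanes.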
6401 case Intrinsic::localaddress: {
6402 const auto &MF = DAG.getMachineFunction();
6403 const auto *RegInfo = Subtarget->getRegisterInfo();
6404 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
6405 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
6406 Op.getSimpleValueType());
6407 }
6408
6409 case Intrinsic::eh_recoverfp: {
6410 // FIXME: This needs to be implemented to correctly handle highly aligned
6411 // stack objects. For now we simply return the incoming FP. Refer D53541
6412 // for more details.
6413 SDValue FnOp = Op.getOperand(1);
6414 SDValue IncomingFPOp = Op.getOperand(2);
6415 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
6416 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
6417 if (!Fn)
6418      report_fatal_error(
6419          "llvm.eh.recoverfp must take a function as the first argument");
6420 return IncomingFPOp;
6421 }
6422
6423 case Intrinsic::aarch64_neon_vsri:
6424 case Intrinsic::aarch64_neon_vsli:
6425 case Intrinsic::aarch64_sve_sri:
6426 case Intrinsic::aarch64_sve_sli: {
6427 EVT Ty = Op.getValueType();
6428
6429 if (!Ty.isVector())
6430 report_fatal_error("Unexpected type for aarch64_neon_vsli");
6431
6432 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
6433
6434 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
6435 IntNo == Intrinsic::aarch64_sve_sri;
6436 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
6437 return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
6438 Op.getOperand(3));
6439 }
6440
6441 case Intrinsic::aarch64_neon_srhadd:
6442 case Intrinsic::aarch64_neon_urhadd:
6443 case Intrinsic::aarch64_neon_shadd:
6444 case Intrinsic::aarch64_neon_uhadd: {
6445 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6446 IntNo == Intrinsic::aarch64_neon_shadd);
6447 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6448 IntNo == Intrinsic::aarch64_neon_urhadd);
6449 unsigned Opcode = IsSignedAdd
6450 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
6451 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
6452 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
6453 Op.getOperand(2));
6454 }
6455 case Intrinsic::aarch64_neon_saddlp:
6456 case Intrinsic::aarch64_neon_uaddlp: {
6457 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
6458                          ? AArch64ISD::UADDLP
6459                          : AArch64ISD::SADDLP;
6460    return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
6461 }
6462 case Intrinsic::aarch64_neon_sdot:
6463 case Intrinsic::aarch64_neon_udot:
6464 case Intrinsic::aarch64_sve_sdot:
6465 case Intrinsic::aarch64_sve_udot: {
6466 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
6467 IntNo == Intrinsic::aarch64_sve_udot)
6468                          ? AArch64ISD::UDOT
6469                          : AArch64ISD::SDOT;
6470    return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
6471 Op.getOperand(2), Op.getOperand(3));
6472 }
6473 case Intrinsic::aarch64_neon_usdot:
6474 case Intrinsic::aarch64_sve_usdot: {
6475 return DAG.getNode(AArch64ISD::USDOT, dl, Op.getValueType(),
6476 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6477 }
6478 case Intrinsic::get_active_lane_mask: {
6479 SDValue ID =
6480 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
6481
6482 EVT VT = Op.getValueType();
6483 if (VT.isScalableVector())
6484 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, Op.getOperand(1),
6485 Op.getOperand(2));
6486
6487 // We can use the SVE whilelo instruction to lower this intrinsic by
6488 // creating the appropriate sequence of scalable vector operations and
6489 // then extracting a fixed-width subvector from the scalable vector.
6490
6491 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
6492 EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
6493
6494 SDValue Mask = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, WhileVT, ID,
6495 Op.getOperand(1), Op.getOperand(2));
6496 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, dl, ContainerVT, Mask);
6497 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, MaskAsInt,
6498 DAG.getVectorIdxConstant(0, dl));
6499 }
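  // Illustrative example (assumed, not from the source): for a fixed-width
  //   %m = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %i, i64 %n)
  // the sequence above emits roughly "whilelo p0.s, x0, x1" on an SVE
  // container, sign-extends the predicate to an integer vector, and then
  // extracts the low fixed-width subvector as the result.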
6500 case Intrinsic::aarch64_neon_saddlv:
6501 case Intrinsic::aarch64_neon_uaddlv: {
6502 EVT OpVT = Op.getOperand(1).getValueType();
6503 EVT ResVT = Op.getValueType();
6504 assert(
6505 ((ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
6506 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) ||
6507 (ResVT == MVT::i64 && (OpVT == MVT::v4i32 || OpVT == MVT::v2i32))) &&
6508 "Unexpected aarch64_neon_u/saddlv type");
6509 (void)OpVT;
6510 // In order to avoid insert_subvector, use v4i32 rather than v2i32.
6511 SDValue ADDLV = DAG.getNode(
6512 IntNo == Intrinsic::aarch64_neon_uaddlv ? AArch64ISD::UADDLV
6513                                                : AArch64ISD::SADDLV,
6514        dl, ResVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64, Op.getOperand(1));
6515 SDValue EXTRACT_VEC_ELT = DAG.getNode(
6516 ISD::EXTRACT_VECTOR_ELT, dl, ResVT == MVT::i32 ? MVT::i32 : MVT::i64,
6517 ADDLV, DAG.getConstant(0, dl, MVT::i64));
6518 return EXTRACT_VEC_ELT;
6519 }
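  // Hedged example: for @llvm.aarch64.neon.uaddlv.i32.v8i8 the node built
  // above keeps the widened sum in a vector register, e.g. roughly
  //   uaddlv h0, v0.8b
  //   fmov   w0, s0
  // with the scalar read out via the EXTRACT_VECTOR_ELT of lane 0.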
6520 case Intrinsic::experimental_cttz_elts: {
6521 SDValue CttzOp = Op.getOperand(1);
6522 EVT VT = CttzOp.getValueType();
6523 assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
6524
6525 if (VT.isFixedLengthVector()) {
6526 // We can use SVE instructions to lower this intrinsic by first creating
6527 // an SVE predicate register mask from the fixed-width vector.
6528 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
6529 SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, NewVT, CttzOp);
6530 CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
6531 }
6532
6533 SDValue NewCttzElts =
6534 DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, CttzOp);
6535 return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
6536 }
6537 case Intrinsic::experimental_vector_match: {
6538 return LowerVectorMatch(Op, DAG);
6539 }
6540 }
6541}
6542
6543bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
6544 if (VT.getVectorElementType() == MVT::i8 ||
6545 VT.getVectorElementType() == MVT::i16) {
6546 EltTy = MVT::i32;
6547 return true;
6548 }
6549 return false;
6550}
6551
6552bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
6553 EVT DataVT) const {
6554 const EVT IndexVT = Extend.getOperand(0).getValueType();
6555 // SVE only supports implicit extension of 32-bit indices.
6556 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
6557 return false;
6558
6559 // Indices cannot be smaller than the main data type.
6560 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
6561 return false;
6562
6563 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
6564 // element container type, which would violate the previous clause.
6565 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
6566}
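// For illustration (assumed behaviour): removing the extend here lets a
// gather with sign-extended 32-bit offsets use the extending addressing mode
// directly, e.g.
//   ld1w { z0.s }, p0/z, [x0, z1.s, sxtw]
// instead of first widening the index vector to 64-bit elements.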
6567
6568bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
6569 EVT ExtVT = ExtVal.getValueType();
6570 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
6571 return false;
6572
6573 // It may be worth creating extending masked loads if there are multiple
6574 // masked loads using the same predicate. That way we'll end up creating
6575 // extending masked loads that may then get split by the legaliser. This
6576 // results in just one set of predicate unpacks at the start, instead of
6577 // multiple sets of vector unpacks after each load.
6578 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
6579 if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) {
6580 // Disable extending masked loads for fixed-width for now, since the code
6581 // quality doesn't look great.
6582 if (!ExtVT.isScalableVector())
6583 return false;
6584
6585 unsigned NumExtMaskedLoads = 0;
6586 for (auto *U : Ld->getMask()->users())
6587 if (isa<MaskedLoadSDNode>(U))
6588 NumExtMaskedLoads++;
6589
6590 if (NumExtMaskedLoads <= 1)
6591 return false;
6592 }
6593 }
6594
6595 return true;
6596}
6597
6598unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
6599  std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
6600      {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
6601       AArch64ISD::GLD1_MERGE_ZERO},
6602      {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
6603       AArch64ISD::GLD1_UXTW_MERGE_ZERO},
6604      {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
6605       AArch64ISD::GLD1_MERGE_ZERO},
6606      {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
6607       AArch64ISD::GLD1_SXTW_MERGE_ZERO},
6608      {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
6609       AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6610      {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
6611       AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
6612      {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
6613       AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6614      {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
6615       AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
6616  };
6617 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
6618 return AddrModes.find(Key)->second;
6619}
6620
6621unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
6622 switch (Opcode) {
6623 default:
6624 llvm_unreachable("unimplemented opcode");
6625 return Opcode;
6626  case AArch64ISD::GLD1_MERGE_ZERO:
6627    return AArch64ISD::GLD1S_MERGE_ZERO;
6628  case AArch64ISD::GLD1_IMM_MERGE_ZERO:
6629    return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
6630  case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
6631    return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
6632  case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
6633    return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
6634  case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
6635    return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
6636  case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
6637    return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
6638  case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
6639    return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
6640  }
6641}
6642
6643SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
6644 SelectionDAG &DAG) const {
6645 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
6646
6647 SDLoc DL(Op);
6648 SDValue Chain = MGT->getChain();
6649 SDValue PassThru = MGT->getPassThru();
6650 SDValue Mask = MGT->getMask();
6651 SDValue BasePtr = MGT->getBasePtr();
6652 SDValue Index = MGT->getIndex();
6653 SDValue Scale = MGT->getScale();
6654 EVT VT = Op.getValueType();
6655 EVT MemVT = MGT->getMemoryVT();
6656 ISD::LoadExtType ExtType = MGT->getExtensionType();
6657 ISD::MemIndexType IndexType = MGT->getIndexType();
6658
6659  // SVE supports only zero (and hence undef) passthrough values; everything
6660  // else must be handled manually by an explicit select on the load's output.
6661 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
6662 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
6663 SDValue Load =
6664 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6665 MGT->getMemOperand(), IndexType, ExtType);
6666 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6667 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
6668 }
6669
6670 bool IsScaled = MGT->isIndexScaled();
6671 bool IsSigned = MGT->isIndexSigned();
6672
6673  // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
6674  // must be calculated beforehand.
6675 uint64_t ScaleVal = Scale->getAsZExtVal();
6676 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6677 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6678 EVT IndexVT = Index.getValueType();
6679 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6680 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6681 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6682
6683 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6684 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6685 MGT->getMemOperand(), IndexType, ExtType);
6686 }
6687
6688 // Lower fixed length gather to a scalable equivalent.
6689 if (VT.isFixedLengthVector()) {
6690 assert(Subtarget->useSVEForFixedLengthVectors() &&
6691 "Cannot lower when not using SVE for fixed vectors!");
6692
6693 // NOTE: Handle floating-point as if integer then bitcast the result.
6694    EVT DataVT = VT.changeVectorElementTypeToInteger();
6695    MemVT = MemVT.changeVectorElementTypeToInteger();
6696
6697 // Find the smallest integer fixed length vector we can use for the gather.
6698 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6699 if (DataVT.getVectorElementType() == MVT::i64 ||
6700 Index.getValueType().getVectorElementType() == MVT::i64 ||
6701 Mask.getValueType().getVectorElementType() == MVT::i64)
6702 PromotedVT = VT.changeVectorElementType(MVT::i64);
6703
6704 // Promote vector operands except for passthrough, which we know is either
6705 // undef or zero, and thus best constructed directly.
6706 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6707 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6708 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6709
6710 // A promoted result type forces the need for an extending load.
6711 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
6712 ExtType = ISD::EXTLOAD;
6713
6714 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6715
6716 // Convert fixed length vector operands to scalable.
6717 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6718 Index = convertToScalableVector(DAG, ContainerVT, Index);
6719    Mask = convertFixedMaskToScalableVector(Mask, DAG);
6720    PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
6721 : DAG.getConstant(0, DL, ContainerVT);
6722
6723 // Emit equivalent scalable vector gather.
6724 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6725 SDValue Load =
6726 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
6727 Ops, MGT->getMemOperand(), IndexType, ExtType);
6728
6729 // Extract fixed length data then convert to the required result type.
6730 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
6731 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
6732 if (VT.isFloatingPoint())
6733 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
6734
6735 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6736 }
6737
6738 // Everything else is legal.
6739 return Op;
6740}
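// A hedged sketch of the fixed-length path above: a fixed-width masked
// gather (say with a v4i32 result) has its index, mask and passthru widened
// into an SVE container, is re-emitted as a scalable MGATHER that selects to
// an SVE gather load such as
//   ld1w { z0.s }, p0/z, [x0, z1.s, sxtw]
// and the fixed-width result is then extracted back out of the scalable
// register (with a bitcast for floating-point element types).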
6741
6742SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
6743 SelectionDAG &DAG) const {
6744 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
6745
6746 SDLoc DL(Op);
6747 SDValue Chain = MSC->getChain();
6748 SDValue StoreVal = MSC->getValue();
6749 SDValue Mask = MSC->getMask();
6750 SDValue BasePtr = MSC->getBasePtr();
6751 SDValue Index = MSC->getIndex();
6752 SDValue Scale = MSC->getScale();
6753 EVT VT = StoreVal.getValueType();
6754 EVT MemVT = MSC->getMemoryVT();
6755 ISD::MemIndexType IndexType = MSC->getIndexType();
6756 bool Truncating = MSC->isTruncatingStore();
6757
6758 bool IsScaled = MSC->isIndexScaled();
6759 bool IsSigned = MSC->isIndexSigned();
6760
6761  // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
6762  // must be calculated beforehand.
6763 uint64_t ScaleVal = Scale->getAsZExtVal();
6764 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6765 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6766 EVT IndexVT = Index.getValueType();
6767 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6768 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6769 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6770
6771 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6772 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6773 MSC->getMemOperand(), IndexType, Truncating);
6774 }
6775
6776 // Lower fixed length scatter to a scalable equivalent.
6777 if (VT.isFixedLengthVector()) {
6778 assert(Subtarget->useSVEForFixedLengthVectors() &&
6779 "Cannot lower when not using SVE for fixed vectors!");
6780
6781 // Once bitcast we treat floating-point scatters as if integer.
6782 if (VT.isFloatingPoint()) {
6783      VT = VT.changeVectorElementTypeToInteger();
6784      MemVT = MemVT.changeVectorElementTypeToInteger();
6785 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
6786 }
6787
6788 // Find the smallest integer fixed length vector we can use for the scatter.
6789 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6790 if (VT.getVectorElementType() == MVT::i64 ||
6791 Index.getValueType().getVectorElementType() == MVT::i64 ||
6792 Mask.getValueType().getVectorElementType() == MVT::i64)
6793 PromotedVT = VT.changeVectorElementType(MVT::i64);
6794
6795 // Promote vector operands.
6796 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6797 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6798 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6799 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
6800
6801 // A promoted value type forces the need for a truncating store.
6802 if (PromotedVT != VT)
6803 Truncating = true;
6804
6805 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6806
6807 // Convert fixed length vector operands to scalable.
6808 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6809 Index = convertToScalableVector(DAG, ContainerVT, Index);
6810    Mask = convertFixedMaskToScalableVector(Mask, DAG);
6811    StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
6812
6813 // Emit equivalent scalable vector scatter.
6814 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6815 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6816 MSC->getMemOperand(), IndexType, Truncating);
6817 }
6818
6819 // Everything else is legal.
6820 return Op;
6821}
6822
6823SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
6824 SDLoc DL(Op);
6825 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
6826 assert(LoadNode && "Expected custom lowering of a masked load node");
6827 EVT VT = Op->getValueType(0);
6828
6829 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
6830 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
6831
6832 SDValue PassThru = LoadNode->getPassThru();
6833 SDValue Mask = LoadNode->getMask();
6834
6835 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
6836 return Op;
6837
6838  SDValue Load = DAG.getMaskedLoad(
6839      VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
6840 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
6841 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
6842 LoadNode->getExtensionType());
6843
6844 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6845
6846 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6847}
6848
6849// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
6850static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
6851                                        EVT VT, EVT MemVT,
6852 SelectionDAG &DAG) {
6853 assert(VT.isVector() && "VT should be a vector type");
6854 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
6855
6856 SDValue Value = ST->getValue();
6857
6858  // First extend the promoted v4i16 to v8i16, truncate it to v8i8, and
6859  // extract the word lane that represents the v4i8 subvector. This optimizes
6860  // the store to:
6861 //
6862 // xtn v0.8b, v0.8h
6863 // str s0, [x0]
6864
6865 SDValue Undef = DAG.getUNDEF(MVT::i16);
6866 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
6867 {Undef, Undef, Undef, Undef});
6868
6869 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
6870 Value, UndefVec);
6871 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
6872
6873 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
6874 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
6875 Trunc, DAG.getConstant(0, DL, MVT::i64));
6876
6877 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
6878 ST->getBasePtr(), ST->getMemOperand());
6879}
6880
6881// Custom lowering for any store, vector or scalar, with or without a
6882// truncate operation. Currently only truncating stores from v4i16 to v4i8
6883// and volatile stores of i128 are custom lowered.
6884SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
6885 SelectionDAG &DAG) const {
6886 SDLoc Dl(Op);
6887 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
6888 assert (StoreNode && "Can only custom lower store nodes");
6889
6890 SDValue Value = StoreNode->getValue();
6891
6892 EVT VT = Value.getValueType();
6893 EVT MemVT = StoreNode->getMemoryVT();
6894
6895 if (VT.isVector()) {
6896    if (useSVEForFixedLengthVectorVT(
6897            VT,
6898 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
6899 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
6900
6901 unsigned AS = StoreNode->getAddressSpace();
6902 Align Alignment = StoreNode->getAlign();
6903 if (Alignment < MemVT.getStoreSize() &&
6904 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
6905 StoreNode->getMemOperand()->getFlags(),
6906 nullptr)) {
6907 return scalarizeVectorStore(StoreNode, DAG);
6908 }
6909
6910 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
6911 MemVT == MVT::v4i8) {
6912 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
6913 }
6914 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
6915 // the custom lowering, as there are no un-paired non-temporal stores and
6916 // legalization will break up 256 bit inputs.
6917    ElementCount EC = MemVT.getVectorElementCount();
6918    if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
6919 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
6920 (MemVT.getScalarSizeInBits() == 8u ||
6921 MemVT.getScalarSizeInBits() == 16u ||
6922 MemVT.getScalarSizeInBits() == 32u ||
6923 MemVT.getScalarSizeInBits() == 64u)) {
6924 SDValue Lo =
6925          DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
6926                      MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
6927                      StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
6928 SDValue Hi =
6929          DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
6930                      MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
6931                      StoreNode->getValue(),
6932 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
6933      SDValue Result = DAG.getMemIntrinsicNode(
6934          AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
6935 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
6936 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6937 return Result;
6938 }
6939 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
6940 return LowerStore128(Op, DAG);
6941 } else if (MemVT == MVT::i64x8) {
6942 SDValue Value = StoreNode->getValue();
6943 assert(Value->getValueType(0) == MVT::i64x8);
6944 SDValue Chain = StoreNode->getChain();
6945 SDValue Base = StoreNode->getBasePtr();
6946 EVT PtrVT = Base.getValueType();
6947 for (unsigned i = 0; i < 8; i++) {
6948 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
6949 Value, DAG.getConstant(i, Dl, MVT::i32));
6950 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
6951 DAG.getConstant(i * 8, Dl, PtrVT));
6952 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
6953 StoreNode->getOriginalAlign());
6954 }
6955 return Chain;
6956 }
6957
6958 return SDValue();
6959}
6960
6961/// Lower atomic or volatile 128-bit stores to a single STP instruction.
6962SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
6963 SelectionDAG &DAG) const {
6964 MemSDNode *StoreNode = cast<MemSDNode>(Op);
6965 assert(StoreNode->getMemoryVT() == MVT::i128);
6966 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
6967
6968 bool IsStoreRelease =
6969      StoreNode->getMergedOrdering() == AtomicOrdering::Release;
6970  if (StoreNode->isAtomic())
6971 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
6972 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
6973           StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
6974           StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
6975
6976 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
6977 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
6978 ? StoreNode->getOperand(1)
6979 : StoreNode->getOperand(2);
6980 SDLoc DL(Op);
6981 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
6982 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
6983 if (DAG.getDataLayout().isBigEndian())
6984 std::swap(StoreValue.first, StoreValue.second);
6985  SDValue Result = DAG.getMemIntrinsicNode(
6986      Opcode, DL, DAG.getVTList(MVT::Other),
6987 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
6988 StoreNode->getBasePtr()},
6989 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6990 return Result;
6991}
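// A hedged illustration: a volatile plain store of an i128 value is emitted
// as a single AArch64ISD::STP node, i.e. roughly
//   stp x1, x2, [x0]
// while an atomic release store on an RCPC3-capable target uses STILP
// instead, so the 128-bit access stays a single paired store where the
// architecture guarantees its atomicity.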
6992
6993SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
6994 SelectionDAG &DAG) const {
6995 SDLoc DL(Op);
6996 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
6997 assert(LoadNode && "Expected custom lowering of a load node");
6998
6999 if (LoadNode->getMemoryVT() == MVT::i64x8) {
7000  SmallVector<SDValue, 8> Ops;
7001  SDValue Base = LoadNode->getBasePtr();
7002 SDValue Chain = LoadNode->getChain();
7003 EVT PtrVT = Base.getValueType();
7004 for (unsigned i = 0; i < 8; i++) {
7005 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
7006 DAG.getConstant(i * 8, DL, PtrVT));
7007 SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
7008 LoadNode->getPointerInfo(),
7009 LoadNode->getOriginalAlign());
7010 Ops.push_back(Part);
7011 Chain = SDValue(Part.getNode(), 1);
7012 }
7013 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
7014 return DAG.getMergeValues({Loaded, Chain}, DL);
7015 }
7016
7017 // Custom lowering for extending v4i8 vector loads.
7018 EVT VT = Op->getValueType(0);
7019 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
7020
7021 if (LoadNode->getMemoryVT() != MVT::v4i8)
7022 return SDValue();
7023
7024 // Avoid generating unaligned loads.
7025 if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4))
7026 return SDValue();
7027
7028 unsigned ExtType;
7029 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
7030 ExtType = ISD::SIGN_EXTEND;
7031 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
7032 LoadNode->getExtensionType() == ISD::EXTLOAD)
7033 ExtType = ISD::ZERO_EXTEND;
7034 else
7035 return SDValue();
7036
7037 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
7038 LoadNode->getBasePtr(), MachinePointerInfo());
7039 SDValue Chain = Load.getValue(1);
7040 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
7041 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
7042 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
7043 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
7044 DAG.getConstant(0, DL, MVT::i64));
7045 if (VT == MVT::v4i32)
7046 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
7047 return DAG.getMergeValues({Ext, Chain}, DL);
7048}
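// Hedged example of the v4i8 extending-load path above: for
//   %w = load <4 x i8>, ptr %p        ; followed by a zext to <4 x i16>
// the code emits a single 32-bit FP load and widens in the vector unit,
// roughly
//   ldr   s0, [x0]
//   ushll v0.8h, v0.8b, #0
// keeping only the low four lanes of the widened result.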
7049
7050SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
7051 SelectionDAG &DAG) const {
7052 SDLoc DL(Op);
7053 SDValue Vec = Op.getOperand(0);
7054 SDValue Mask = Op.getOperand(1);
7055 SDValue Passthru = Op.getOperand(2);
7056 EVT VecVT = Vec.getValueType();
7057 EVT MaskVT = Mask.getValueType();
7058 EVT ElmtVT = VecVT.getVectorElementType();
7059 const bool IsFixedLength = VecVT.isFixedLengthVector();
7060 const bool HasPassthru = !Passthru.isUndef();
7061 unsigned MinElmts = VecVT.getVectorElementCount().getKnownMinValue();
7062 EVT FixedVecVT = MVT::getVectorVT(ElmtVT.getSimpleVT(), MinElmts);
7063
7064 assert(VecVT.isVector() && "Input to VECTOR_COMPRESS must be vector.");
7065
7066 if (!Subtarget->isSVEAvailable())
7067 return SDValue();
7068
7069 if (IsFixedLength && VecVT.getSizeInBits().getFixedValue() > 128)
7070 return SDValue();
7071
7072 // Only <vscale x {4|2} x {i32|i64}> supported for compact.
7073 if (MinElmts != 2 && MinElmts != 4)
7074 return SDValue();
7075
7076 // We can use the SVE register containing the NEON vector in its lowest bits.
7077 if (IsFixedLength) {
7078 EVT ScalableVecVT =
7079 MVT::getScalableVectorVT(ElmtVT.getSimpleVT(), MinElmts);
7080 EVT ScalableMaskVT = MVT::getScalableVectorVT(
7081 MaskVT.getVectorElementType().getSimpleVT(), MinElmts);
7082
7083 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
7084 DAG.getUNDEF(ScalableVecVT), Vec,
7085 DAG.getConstant(0, DL, MVT::i64));
7086 Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableMaskVT,
7087 DAG.getUNDEF(ScalableMaskVT), Mask,
7088 DAG.getConstant(0, DL, MVT::i64));
7089    Mask = DAG.getNode(ISD::TRUNCATE, DL,
7090                       ScalableMaskVT.changeVectorElementType(MVT::i1), Mask);
7091 Passthru = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
7092 DAG.getUNDEF(ScalableVecVT), Passthru,
7093 DAG.getConstant(0, DL, MVT::i64));
7094
7095 VecVT = Vec.getValueType();
7096 MaskVT = Mask.getValueType();
7097 }
7098
7099 // Get legal type for compact instruction
7100 EVT ContainerVT = getSVEContainerType(VecVT);
7101 EVT CastVT = VecVT.changeVectorElementTypeToInteger();
7102
7103 // Convert to i32 or i64 for smaller types, as these are the only supported
7104 // sizes for compact.
7105 if (ContainerVT != VecVT) {
7106 Vec = DAG.getBitcast(CastVT, Vec);
7107 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, ContainerVT, Vec);
7108 }
7109
7110 SDValue Compressed = DAG.getNode(
7111      ISD::INTRINSIC_WO_CHAIN, DL, Vec.getValueType(),
7112      DAG.getConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask, Vec);
7113
7114 // compact fills with 0s, so if our passthru is all 0s, do nothing here.
7115 if (HasPassthru && !ISD::isConstantSplatVectorAllZeros(Passthru.getNode())) {
7116 SDValue Offset = DAG.getNode(
7117 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
7118 DAG.getConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask, Mask);
7119
7120 SDValue IndexMask = DAG.getNode(
7121 ISD::INTRINSIC_WO_CHAIN, DL, MaskVT,
7122 DAG.getConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64),
7123 DAG.getConstant(0, DL, MVT::i64), Offset);
7124
7125 Compressed =
7126 DAG.getNode(ISD::VSELECT, DL, VecVT, IndexMask, Compressed, Passthru);
7127 }
7128
7129 // Extracting from a legal SVE type before truncating produces better code.
7130 if (IsFixedLength) {
7131 Compressed = DAG.getNode(
7132        ISD::EXTRACT_SUBVECTOR, DL,
7133        FixedVecVT.changeVectorElementType(ContainerVT.getVectorElementType()),
7134 Compressed, DAG.getConstant(0, DL, MVT::i64));
7135 CastVT = FixedVecVT.changeVectorElementTypeToInteger();
7136 VecVT = FixedVecVT;
7137 }
7138
7139 // If we changed the element type before, we need to convert it back.
7140 if (ContainerVT != VecVT) {
7141 Compressed = DAG.getNode(ISD::TRUNCATE, DL, CastVT, Compressed);
7142 Compressed = DAG.getBitcast(VecVT, Compressed);
7143 }
7144
7145 return Compressed;
7146}
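// A hedged example of the lowering above: compressing a <4 x i32> vector by
// a <4 x i1> mask ends up, after widening into an SVE register, as roughly
//   compact z0.s, p0, z0.s
// followed, only when a non-zero passthru is present, by a cntp/whilelo-based
// select that splices the passthru into the trailing lanes.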
7147
7148// Generate SUBS and CSEL for integer abs.
7149SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
7150 MVT VT = Op.getSimpleValueType();
7151
7152 if (VT.isVector())
7153 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
7154
7155 SDLoc DL(Op);
7156 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
7157 Op.getOperand(0));
7158 // Generate SUBS & CSEL.
7159 SDValue Cmp =
7160 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
7161 Op.getOperand(0), DAG.getConstant(0, DL, VT));
7162 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
7163 DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
7164 Cmp.getValue(1));
7165}
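// Hedged illustration: for a scalar i64 @llvm.abs the SUBS+CSEL pattern built
// above typically selects to something like
//   cmp  x0, #0
//   cneg x0, x0, mi
// i.e. the negated value is chosen only when the input is negative.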
7166
7168 SDValue Chain = Op.getOperand(0);
7169 SDValue Cond = Op.getOperand(1);
7170 SDValue Dest = Op.getOperand(2);
7171
7172  AArch64CC::CondCode CC;
7173  if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
7174 SDLoc dl(Op);
7175 SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32);
7176 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
7177 Cmp);
7178 }
7179
7180 return SDValue();
7181}
7182
7183// Treat FSHR with constant shifts as a legal operation; otherwise it is
7184// expanded. FSHL is converted to FSHR before deciding what to do with it.
7186 SDValue Shifts = Op.getOperand(2);
7187 // Check if the shift amount is a constant
7188 // If opcode is FSHL, convert it to FSHR
7189 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
7190 SDLoc DL(Op);
7191 MVT VT = Op.getSimpleValueType();
7192
7193 if (Op.getOpcode() == ISD::FSHL) {
7194 unsigned int NewShiftNo =
7195 VT.getFixedSizeInBits() - ShiftNo->getZExtValue();
7196 return DAG.getNode(
7197 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7198 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7199 } else if (Op.getOpcode() == ISD::FSHR) {
7200 return Op;
7201 }
7202 }
7203
7204 return SDValue();
7205}
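// Hedged example: an i64 funnel shift with a constant amount, e.g.
//   %r = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 13)
// stays as an FSHR node and is selected as a single
//   extr x0, x0, x1, #13
// whereas a variable shift amount falls through and is expanded generically.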
7206
7208 SDValue X = Op.getOperand(0);
7209 EVT XScalarTy = X.getValueType();
7210 SDValue Exp = Op.getOperand(1);
7211
7212 SDLoc DL(Op);
7213 EVT XVT, ExpVT;
7214 switch (Op.getSimpleValueType().SimpleTy) {
7215 default:
7216 return SDValue();
7217 case MVT::bf16:
7218 case MVT::f16:
7219 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
7220 [[fallthrough]];
7221 case MVT::f32:
7222 XVT = MVT::nxv4f32;
7223 ExpVT = MVT::nxv4i32;
7224 break;
7225 case MVT::f64:
7226 XVT = MVT::nxv2f64;
7227 ExpVT = MVT::nxv2i64;
7228 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
7229 break;
7230 }
7231
7232 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
7233 SDValue VX =
7234 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
7235 SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
7236 DAG.getUNDEF(ExpVT), Exp, Zero);
7237 SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1),
7238 AArch64SVEPredPattern::all);
7239 SDValue FScale =
7240      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, XVT,
7241                  DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64),
7242 VPg, VX, VExp);
7243 SDValue Final =
7244 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
7245 if (X.getValueType() != XScalarTy)
7246 Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
7247 DAG.getIntPtrConstant(1, SDLoc(Op), /*isTarget=*/true));
7248 return Final;
7249}
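// Hedged illustration of the ldexp lowering above: a scalar f64 ldexp is
// placed in lane 0 of an SVE register and scaled with roughly
//   fscale z0.d, p0/m, z0.d, z1.d
// before the scalar result is extracted again; f16/bf16 inputs take a detour
// through f32 and are rounded back at the end.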
7250
7251SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
7252 SelectionDAG &DAG) const {
7253 // Note: x18 cannot be used for the Nest parameter on Windows and macOS.
7254 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
7255    report_fatal_error(
7256        "ADJUST_TRAMPOLINE operation is only supported on Linux.");
7257
7258 return Op.getOperand(0);
7259}
7260
7261SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
7262 SelectionDAG &DAG) const {
7263
7264 // Note: x18 cannot be used for the Nest parameter on Windows and macOS.
7265 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
7266 report_fatal_error("INIT_TRAMPOLINE operation is only supported on Linux.");
7267
7268 SDValue Chain = Op.getOperand(0);
7269 SDValue Trmp = Op.getOperand(1); // trampoline
7270 SDValue FPtr = Op.getOperand(2); // nested function
7271 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
7272 SDLoc dl(Op);
7273
7274 EVT PtrVT = getPointerTy(DAG.getDataLayout());
7275 Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
7276
7277  TargetLowering::ArgListTy Args;
7278  TargetLowering::ArgListEntry Entry;
7279
7280 Entry.Ty = IntPtrTy;
7281 Entry.Node = Trmp;
7282 Args.push_back(Entry);
7283
7284 if (auto *FI = dyn_cast<FrameIndexSDNode>(Trmp.getNode())) {
7285    MachineFunction &MF = DAG.getMachineFunction();
7286    MachineFrameInfo &MFI = MF.getFrameInfo();
7287 Entry.Node =
7288 DAG.getConstant(MFI.getObjectSize(FI->getIndex()), dl, MVT::i64);
7289 } else
7290 Entry.Node = DAG.getConstant(36, dl, MVT::i64);
7291
7292 Args.push_back(Entry);
7293 Entry.Node = FPtr;
7294 Args.push_back(Entry);
7295 Entry.Node = Nest;
7296 Args.push_back(Entry);
7297
7298 // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
7299  TargetLowering::CallLoweringInfo CLI(DAG);
7300  CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
7301      CallingConv::C, Type::getVoidTy(*DAG.getContext()),
7302      DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args));
7303
7304 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
7305 return CallResult.second;
7306}
7307
7308SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
7309                                              SelectionDAG &DAG) const {
7310 LLVM_DEBUG(dbgs() << "Custom lowering: ");
7311 LLVM_DEBUG(Op.dump());
7312
7313 switch (Op.getOpcode()) {
7314 default:
7315 llvm_unreachable("unimplemented operand");
7316 return SDValue();
7317 case ISD::BITCAST:
7318 return LowerBITCAST(Op, DAG);
7319 case ISD::GlobalAddress:
7320 return LowerGlobalAddress(Op, DAG);
7321  case ISD::GlobalTLSAddress:
7322    return LowerGlobalTLSAddress(Op, DAG);
7323  case ISD::PtrAuthGlobalAddress:
7324    return LowerPtrAuthGlobalAddress(Op, DAG);
7325  case ISD::ADJUST_TRAMPOLINE:
7326    return LowerADJUST_TRAMPOLINE(Op, DAG);
7327  case ISD::INIT_TRAMPOLINE:
7328    return LowerINIT_TRAMPOLINE(Op, DAG);
7329  case ISD::SETCC:
7330  case ISD::STRICT_FSETCC:
7331  case ISD::STRICT_FSETCCS:
7332    return LowerSETCC(Op, DAG);
7333 case ISD::SETCCCARRY:
7334 return LowerSETCCCARRY(Op, DAG);
7335 case ISD::BRCOND:
7336 return LowerBRCOND(Op, DAG);
7337 case ISD::BR_CC:
7338 return LowerBR_CC(Op, DAG);
7339 case ISD::SELECT:
7340 return LowerSELECT(Op, DAG);
7341 case ISD::SELECT_CC:
7342 return LowerSELECT_CC(Op, DAG);
7343 case ISD::JumpTable:
7344 return LowerJumpTable(Op, DAG);
7345 case ISD::BR_JT:
7346 return LowerBR_JT(Op, DAG);
7347 case ISD::BRIND:
7348 return LowerBRIND(Op, DAG);
7349 case ISD::ConstantPool:
7350 return LowerConstantPool(Op, DAG);
7351 case ISD::BlockAddress:
7352 return LowerBlockAddress(Op, DAG);
7353 case ISD::VASTART:
7354 return LowerVASTART(Op, DAG);
7355 case ISD::VACOPY:
7356 return LowerVACOPY(Op, DAG);
7357 case ISD::VAARG:
7358 return LowerVAARG(Op, DAG);
7359 case ISD::UADDO_CARRY:
7360 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
7361 case ISD::USUBO_CARRY:
7362 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
7363 case ISD::SADDO_CARRY:
7364 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
7365 case ISD::SSUBO_CARRY:
7366 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
7367 case ISD::SADDO:
7368 case ISD::UADDO:
7369 case ISD::SSUBO:
7370 case ISD::USUBO:
7371 case ISD::SMULO:
7372 case ISD::UMULO:
7373 return LowerXALUO(Op, DAG);
7374 case ISD::FADD:
7375 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
7376 case ISD::FSUB:
7377 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
7378 case ISD::FMUL:
7379 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
7380 case ISD::FMA:
7381 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
7382 case ISD::FDIV:
7383 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
7384 case ISD::FNEG:
7385 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
7386 case ISD::FCEIL:
7387 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
7388 case ISD::FFLOOR:
7389 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
7390 case ISD::FNEARBYINT:
7391 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
7392 case ISD::FRINT:
7393 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
7394 case ISD::FROUND:
7395 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
7396 case ISD::FROUNDEVEN:
7397 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
7398 case ISD::FTRUNC:
7399 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
7400 case ISD::FSQRT:
7401 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
7402 case ISD::FABS:
7403 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
7404  case ISD::FP_ROUND:
7405  case ISD::STRICT_FP_ROUND:
7406    return LowerFP_ROUND(Op, DAG);
7407  case ISD::FP_EXTEND:
7408  case ISD::STRICT_FP_EXTEND:
7409    return LowerFP_EXTEND(Op, DAG);
7410 case ISD::FRAMEADDR:
7411 return LowerFRAMEADDR(Op, DAG);
7412 case ISD::SPONENTRY:
7413 return LowerSPONENTRY(Op, DAG);
7414 case ISD::RETURNADDR:
7415 return LowerRETURNADDR(Op, DAG);
7416  case ISD::ADDROFRETURNADDR:
7417    return LowerADDROFRETURNADDR(Op, DAG);
7418  case ISD::CONCAT_VECTORS:
7419    return LowerCONCAT_VECTORS(Op, DAG);
7420  case ISD::INSERT_VECTOR_ELT:
7421    return LowerINSERT_VECTOR_ELT(Op, DAG);
7422  case ISD::EXTRACT_VECTOR_ELT:
7423    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
7424  case ISD::BUILD_VECTOR:
7425    return LowerBUILD_VECTOR(Op, DAG);
7426  case ISD::ZERO_EXTEND_VECTOR_INREG:
7427    return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
7428  case ISD::VECTOR_SHUFFLE:
7429    return LowerVECTOR_SHUFFLE(Op, DAG);
7430  case ISD::SPLAT_VECTOR:
7431    return LowerSPLAT_VECTOR(Op, DAG);
7432  case ISD::EXTRACT_SUBVECTOR:
7433    return LowerEXTRACT_SUBVECTOR(Op, DAG);
7434  case ISD::INSERT_SUBVECTOR:
7435    return LowerINSERT_SUBVECTOR(Op, DAG);
7436 case ISD::SDIV:
7437 case ISD::UDIV:
7438 return LowerDIV(Op, DAG);
7439 case ISD::SMIN:
7440 case ISD::UMIN:
7441 case ISD::SMAX:
7442 case ISD::UMAX:
7443 return LowerMinMax(Op, DAG);
7444 case ISD::SRA:
7445 case ISD::SRL:
7446 case ISD::SHL:
7447 return LowerVectorSRA_SRL_SHL(Op, DAG);
7448 case ISD::SHL_PARTS:
7449 case ISD::SRL_PARTS:
7450 case ISD::SRA_PARTS:
7451 return LowerShiftParts(Op, DAG);
7452 case ISD::CTPOP:
7453 case ISD::PARITY:
7454 return LowerCTPOP_PARITY(Op, DAG);
7455 case ISD::FCOPYSIGN:
7456 return LowerFCOPYSIGN(Op, DAG);
7457 case ISD::OR:
7458 return LowerVectorOR(Op, DAG);
7459 case ISD::XOR:
7460 return LowerXOR(Op, DAG);
7461 case ISD::PREFETCH:
7462 return LowerPREFETCH(Op, DAG);
7463  case ISD::SINT_TO_FP:
7464  case ISD::UINT_TO_FP:
7465  case ISD::STRICT_SINT_TO_FP:
7466  case ISD::STRICT_UINT_TO_FP:
7467    return LowerINT_TO_FP(Op, DAG);
7468  case ISD::FP_TO_SINT:
7469  case ISD::FP_TO_UINT:
7470  case ISD::STRICT_FP_TO_SINT:
7471  case ISD::STRICT_FP_TO_UINT:
7472    return LowerFP_TO_INT(Op, DAG);
7473  case ISD::FP_TO_SINT_SAT:
7474  case ISD::FP_TO_UINT_SAT:
7475    return LowerFP_TO_INT_SAT(Op, DAG);
7476 case ISD::FSINCOS:
7477 return LowerFSINCOS(Op, DAG);
7478 case ISD::GET_ROUNDING:
7479 return LowerGET_ROUNDING(Op, DAG);
7480 case ISD::SET_ROUNDING:
7481 return LowerSET_ROUNDING(Op, DAG);
7482 case ISD::GET_FPMODE:
7483 return LowerGET_FPMODE(Op, DAG);
7484 case ISD::SET_FPMODE:
7485 return LowerSET_FPMODE(Op, DAG);
7486 case ISD::RESET_FPMODE:
7487 return LowerRESET_FPMODE(Op, DAG);
7488 case ISD::MUL:
7489 return LowerMUL(Op, DAG);
7490 case ISD::MULHS:
7491 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
7492 case ISD::MULHU:
7493 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
7494  case ISD::INTRINSIC_W_CHAIN:
7495    return LowerINTRINSIC_W_CHAIN(Op, DAG);
7496  case ISD::INTRINSIC_WO_CHAIN:
7497    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7498  case ISD::INTRINSIC_VOID:
7499    return LowerINTRINSIC_VOID(Op, DAG);
7500 case ISD::ATOMIC_STORE:
7501 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
7502 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
7503 return LowerStore128(Op, DAG);
7504 }
7505 return SDValue();
7506 case ISD::STORE:
7507 return LowerSTORE(Op, DAG);
7508 case ISD::MSTORE:
7509 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
7510 case ISD::MGATHER:
7511 return LowerMGATHER(Op, DAG);
7512 case ISD::MSCATTER:
7513 return LowerMSCATTER(Op, DAG);
7514  case ISD::VECREDUCE_SEQ_FADD:
7515    return LowerVECREDUCE_SEQ_FADD(Op, DAG);
7516  case ISD::VECREDUCE_ADD:
7517  case ISD::VECREDUCE_AND:
7518  case ISD::VECREDUCE_OR:
7519  case ISD::VECREDUCE_XOR:
7520  case ISD::VECREDUCE_SMAX:
7521  case ISD::VECREDUCE_SMIN:
7522  case ISD::VECREDUCE_UMAX:
7523  case ISD::VECREDUCE_UMIN:
7524  case ISD::VECREDUCE_FADD:
7525  case ISD::VECREDUCE_FMAX:
7526  case ISD::VECREDUCE_FMIN:
7527  case ISD::VECREDUCE_FMAXIMUM:
7528  case ISD::VECREDUCE_FMINIMUM:
7529    return LowerVECREDUCE(Op, DAG);
7530  case ISD::ATOMIC_LOAD_AND:
7531    return LowerATOMIC_LOAD_AND(Op, DAG);
7532  case ISD::DYNAMIC_STACKALLOC:
7533    return LowerDYNAMIC_STACKALLOC(Op, DAG);
7534  case ISD::VSCALE:
7535    return LowerVSCALE(Op, DAG);
7536  case ISD::VECTOR_COMPRESS:
7537    return LowerVECTOR_COMPRESS(Op, DAG);
7538 case ISD::ANY_EXTEND:
7539 case ISD::SIGN_EXTEND:
7540 case ISD::ZERO_EXTEND:
7541 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
7542  case ISD::SIGN_EXTEND_INREG: {
7543    // Only custom lower when ExtraVT has a legal byte based element type.
7544    EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
7545    EVT ExtraEltVT = ExtraVT.getVectorElementType();
7546    if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
7547        (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
7548      return SDValue();
7549
7550    return LowerToPredicatedOp(Op, DAG,
7551                               AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
7552  }
7553 case ISD::TRUNCATE:
7554 return LowerTRUNCATE(Op, DAG);
7555 case ISD::MLOAD:
7556 return LowerMLOAD(Op, DAG);
7557 case ISD::LOAD:
7558 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
7559 !Subtarget->isNeonAvailable()))
7560 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
7561 return LowerLOAD(Op, DAG);
7562 case ISD::ADD:
7563 case ISD::AND:
7564 case ISD::SUB:
7565 return LowerToScalableOp(Op, DAG);
7566 case ISD::FMAXIMUM:
7567 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
7568 case ISD::FMAXNUM:
7569 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
7570 case ISD::FMINIMUM:
7571 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
7572 case ISD::FMINNUM:
7573 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
7574 case ISD::VSELECT:
7575 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
7576 case ISD::ABS:
7577 return LowerABS(Op, DAG);
7578 case ISD::ABDS:
7579 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
7580 case ISD::ABDU:
7581 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
7582 case ISD::AVGFLOORS:
7583 return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
7584 case ISD::AVGFLOORU:
7585 return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
7586 case ISD::AVGCEILS:
7587 return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
7588 case ISD::AVGCEILU:
7589 return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
7590 case ISD::BITREVERSE:
7591 return LowerBitreverse(Op, DAG);
7592 case ISD::BSWAP:
7593 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
7594 case ISD::CTLZ:
7595 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
7596 case ISD::CTTZ:
7597 return LowerCTTZ(Op, DAG);
7598 case ISD::VECTOR_SPLICE:
7599 return LowerVECTOR_SPLICE(Op, DAG);
7600 case ISD::VECTOR_DEINTERLEAVE:
7601 return LowerVECTOR_DEINTERLEAVE(Op, DAG);
7602 case ISD::VECTOR_INTERLEAVE:
7603 return LowerVECTOR_INTERLEAVE(Op, DAG);
7604 case ISD::LRINT:
7605 case ISD::LLRINT:
7606 if (Op.getValueType().isVector())
7607 return LowerVectorXRINT(Op, DAG);
7608 [[fallthrough]];
7609 case ISD::LROUND:
7610 case ISD::LLROUND: {
7611 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
7612 Op.getOperand(0).getValueType() == MVT::bf16) &&
7613 "Expected custom lowering of rounding operations only for f16");
7614 SDLoc DL(Op);
7615 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
7616 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
7617 }
7618 case ISD::STRICT_LROUND:
7619 case ISD::STRICT_LLROUND:
7620 case ISD::STRICT_LRINT:
7621 case ISD::STRICT_LLRINT: {
7622 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
7623 Op.getOperand(1).getValueType() == MVT::bf16) &&
7624 "Expected custom lowering of rounding operations only for f16");
7625 SDLoc DL(Op);
7626 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
7627 {Op.getOperand(0), Op.getOperand(1)});
7628 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
7629 {Ext.getValue(1), Ext.getValue(0)});
7630 }
7631 case ISD::WRITE_REGISTER: {
7632 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
7633 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
7634 SDLoc DL(Op);
7635
7636 SDValue Chain = Op.getOperand(0);
7637 SDValue SysRegName = Op.getOperand(1);
7638 std::pair<SDValue, SDValue> Pair =
7639 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
7640
7641 // chain = MSRR(chain, sysregname, lo, hi)
7642 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
7643 SysRegName, Pair.first, Pair.second);
7644
7645 return Result;
7646 }
7647 case ISD::FSHL:
7648 case ISD::FSHR:
7649 return LowerFunnelShift(Op, DAG);
7650 case ISD::FLDEXP:
7651 return LowerFLDEXP(Op, DAG);
7652 case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
7653 return LowerVECTOR_HISTOGRAM(Op, DAG);
7654 }
7655}
7656
7657bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
7658 return !Subtarget->useSVEForFixedLengthVectors();
7659}
7660
7661bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
7662 EVT VT, bool OverrideNEON) const {
7663 if (!VT.isFixedLengthVector() || !VT.isSimple())
7664 return false;
7665
7666 // Don't use SVE for vectors we cannot scalarize if required.
7667 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
7668 // Fixed length predicates should be promoted to i8.
7669 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
7670 case MVT::i1:
7671 default:
7672 return false;
7673 case MVT::i8:
7674 case MVT::i16:
7675 case MVT::i32:
7676 case MVT::i64:
7677 case MVT::f16:
7678 case MVT::f32:
7679 case MVT::f64:
7680 break;
7681 }
7682
7683 // NEON-sized vectors can be emulated using SVE instructions.
7684 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
7685 return Subtarget->isSVEorStreamingSVEAvailable();
7686
7687 // Ensure NEON MVTs only belong to a single register class.
7688 if (VT.getFixedSizeInBits() <= 128)
7689 return false;
7690
7691 // Ensure wider than NEON code generation is enabled.
7692 if (!Subtarget->useSVEForFixedLengthVectors())
7693 return false;
7694
7695 // Don't use SVE for types that don't fit.
7696 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
7697 return false;
7698
7699 // TODO: Perhaps an artificial restriction, but worth having whilst getting
7700 // the base fixed length SVE support in place.
7701 if (!VT.isPow2VectorType())
7702 return false;
7703
7704 return true;
7705}
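// Illustrative examples (informal, assuming -aarch64-sve-vector-bits-min=256):
//   v8i32  (256 bits, power-of-2, legal element) -> true  (wider than NEON, fits in SVE)
//   v4i32  (128 bits, NEON-sized)                -> false unless OverrideNEON is set and
//                                                   SVE or streaming SVE is available
//   v3i32  (non-power-of-2)                      -> false
//   v16i1  (predicate element type)              -> false (such vectors are promoted to i8)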
7706
7707//===----------------------------------------------------------------------===//
7708// Calling Convention Implementation
7709//===----------------------------------------------------------------------===//
7710
7711static unsigned getIntrinsicID(const SDNode *N) {
7712 unsigned Opcode = N->getOpcode();
7713 switch (Opcode) {
7714 default:
7715 return Intrinsic::not_intrinsic;
7716 case ISD::INTRINSIC_WO_CHAIN: {
7717 unsigned IID = N->getConstantOperandVal(0);
7718 if (IID < Intrinsic::num_intrinsics)
7719 return IID;
7720 return Intrinsic::not_intrinsic;
7721 }
7722 }
7723}
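// Usage sketch (illustrative): for an ISD::INTRINSIC_WO_CHAIN node whose constant
// operand 0 is Intrinsic::aarch64_neon_umull, this returns that intrinsic ID; for
// any other node it returns Intrinsic::not_intrinsic.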
7724
7725bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
7726 SDValue N1) const {
7727 if (!N0.hasOneUse())
7728 return false;
7729
7730 unsigned IID = getIntrinsicID(N1.getNode());
7731 // Avoid reassociating expressions that can be lowered to smlal/umlal.
7732 if (IID == Intrinsic::aarch64_neon_umull ||
7733 N1.getOpcode() == AArch64ISD::UMULL ||
7734 IID == Intrinsic::aarch64_neon_smull ||
7735 N1.getOpcode() == AArch64ISD::SMULL)
7736 return N0.getOpcode() != ISD::ADD;
7737
7738 return true;
7739}
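// Example (illustrative): for N0 = (add x, y) and N1 = (intrinsic aarch64_neon_smull a, b)
// this returns false, so the combiner keeps add(add(x, y), smull(a, b)) intact and
// instruction selection can still fold the outer add with the smull into an SMLAL.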
7740
7741/// Selects the correct CCAssignFn for a given CallingConvention value.
7742CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
7743 bool IsVarArg) const {
7744 switch (CC) {
7745 default:
7746 report_fatal_error("Unsupported calling convention.");
7747 case CallingConv::GHC:
7748 return CC_AArch64_GHC;
7750 // The VarArg implementation makes assumptions about register
7751 // argument passing that do not hold for preserve_none, so we
7752 // instead fall back to C argument passing.
7753 // The non-vararg case is handled in the CC function itself.
7754 if (!IsVarArg)
7756 [[fallthrough]];
7757 case CallingConv::C:
7758 case CallingConv::Fast:
7762 case CallingConv::Swift:
7764 case CallingConv::Tail:
7765 case CallingConv::GRAAL:
7766 if (Subtarget->isTargetWindows()) {
7767 if (IsVarArg) {
7768 if (Subtarget->isWindowsArm64EC())
7771 }
7772 return CC_AArch64_Win64PCS;
7773 }
7774 if (!Subtarget->isTargetDarwin())
7775 return CC_AArch64_AAPCS;
7776 if (!IsVarArg)
7777 return CC_AArch64_DarwinPCS;
7780 case CallingConv::Win64:
7781 if (IsVarArg) {
7782 if (Subtarget->isWindowsArm64EC())
7785 }
7786 return CC_AArch64_Win64PCS;
7788 if (Subtarget->isWindowsArm64EC())
7796 return CC_AArch64_AAPCS;
7801 }
7802}
7803
7804CCAssignFn *
7805AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
7806 switch (CC) {
7807 default:
7808 return RetCC_AArch64_AAPCS;
7812 if (Subtarget->isWindowsArm64EC())
7814 return RetCC_AArch64_AAPCS;
7815 }
7816}
7817
7818static bool isPassedInFPR(EVT VT) {
7819 return VT.isFixedLengthVector() ||
7820 (VT.isFloatingPoint() && !VT.isScalableVector());
7821}
7822
7823SDValue AArch64TargetLowering::LowerFormalArguments(
7824 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
7825 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
7826 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
7827 MachineFunction &MF = DAG.getMachineFunction();
7828 const Function &F = MF.getFunction();
7829 MachineFrameInfo &MFI = MF.getFrameInfo();
7830 bool IsWin64 =
7831 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
7832 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
7833 (isVarArg && Subtarget->isWindowsArm64EC());
7834 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7835
7836 SmallVector<ISD::OutputArg, 4> Outs;
7837 GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
7838 DAG.getTargetLoweringInfo(), MF.getDataLayout());
7839 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
7840 FuncInfo->setIsSVECC(true);
7841
7842 // Assign locations to all of the incoming arguments.
7843 SmallVector<CCValAssign, 16> ArgLocs;
7844 DenseMap<unsigned, SDValue> CopiedRegs;
7845 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
7846
7847 // At this point, Ins[].VT may already be promoted to i32. To correctly
7848 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
7849 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
7850 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
7851 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
7852 // LocVT.
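// Example (illustrative): an i8 parameter arrives here with Ins[i].VT == MVT::i32
// after promotion; passing ValVT = MVT::i8 below lets the assignment function give
// it a byte-sized stack slot (as DarwinPCS does) rather than a promoted 32-bit one.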
7853 unsigned NumArgs = Ins.size();
7854 Function::const_arg_iterator CurOrigArg = F.arg_begin();
7855 unsigned CurArgIdx = 0;
7856 for (unsigned i = 0; i != NumArgs; ++i) {
7857 MVT ValVT = Ins[i].VT;
7858 if (Ins[i].isOrigArg()) {
7859 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
7860 CurArgIdx = Ins[i].getOrigArgIndex();
7861
7862 // Get type of the original argument.
7863 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
7864 /*AllowUnknown*/ true);
7865 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
7866 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
7867 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
7868 ValVT = MVT::i8;
7869 else if (ActualMVT == MVT::i16)
7870 ValVT = MVT::i16;
7871 }
7872 bool UseVarArgCC = false;
7873 if (IsWin64)
7874 UseVarArgCC = isVarArg;
7875 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
7876 bool Res =
7877 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
7878 assert(!Res && "Call operand has unhandled type");
7879 (void)Res;
7880 }
7881
7882 SMEAttrs Attrs(MF.getFunction());
7883 bool IsLocallyStreaming =
7884 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
7885 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
7886 SDValue Glue = Chain.getValue(1);
7887
7888 SmallVector<SDValue, 16> ArgValues;
7889 unsigned ExtraArgLocs = 0;
7890 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
7891 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
7892
7893 if (Ins[i].Flags.isByVal()) {
7894 // Byval is used for HFAs in the PCS, but the system should work in a
7895 // non-compliant manner for larger structs.
7896 EVT PtrVT = getPointerTy(DAG.getDataLayout());
7897 int Size = Ins[i].Flags.getByValSize();
7898 unsigned NumRegs = (Size + 7) / 8;
7899
7900 // FIXME: This works on big-endian for composite byvals, which are the common
7901 // case. It should also work for fundamental types too.
7902 unsigned FrameIdx =
7903 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
7904 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
7905 InVals.push_back(FrameIdxN);
7906
7907 continue;
7908 }
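// Example (illustrative): a 12-byte byval struct (e.g. an HFA of three floats)
// gives NumRegs = 2, so a 16-byte (8 * NumRegs) fixed object is created at the
// incoming stack offset and its address is handed to the argument's users.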
7909
7910 if (Ins[i].Flags.isSwiftAsync())
7911 MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
7912
7913 SDValue ArgValue;
7914 if (VA.isRegLoc()) {
7915 // Arguments stored in registers.
7916 EVT RegVT = VA.getLocVT();
7917 const TargetRegisterClass *RC;
7918
7919 if (RegVT == MVT::i32)
7920 RC = &AArch64::GPR32RegClass;
7921 else if (RegVT == MVT::i64)
7922 RC = &AArch64::GPR64RegClass;
7923 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
7924 RC = &AArch64::FPR16RegClass;
7925 else if (RegVT == MVT::f32)
7926 RC = &AArch64::FPR32RegClass;
7927 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
7928 RC = &AArch64::FPR64RegClass;
7929 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
7930 RC = &AArch64::FPR128RegClass;
7931 else if (RegVT.isScalableVector() &&
7932 RegVT.getVectorElementType() == MVT::i1) {
7933 FuncInfo->setIsSVECC(true);
7934 RC = &AArch64::PPRRegClass;
7935 } else if (RegVT == MVT::aarch64svcount) {
7936 FuncInfo->setIsSVECC(true);
7937 RC = &AArch64::PPRRegClass;
7938 } else if (RegVT.isScalableVector()) {
7939 FuncInfo->setIsSVECC(true);
7940 RC = &AArch64::ZPRRegClass;
7941 } else
7942 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
7943
7944 // Transform the arguments in physical registers into virtual ones.
7945 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
7946
7947 if (IsLocallyStreaming) {
7948 // LocallyStreamingFunctions must insert the SMSTART in the correct
7949 // position, so we use Glue to ensure no instructions can be scheduled
7950 // between the chain of:
7951 // t0: ch,glue = EntryNode
7952 // t1: res,ch,glue = CopyFromReg
7953 // ...
7954 // tn: res,ch,glue = CopyFromReg t(n-1), ..
7955 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
7956 // ^^^^^^
7957 // This will be the new Chain/Root node.
7958 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
7959 Glue = ArgValue.getValue(2);
7960 if (isPassedInFPR(ArgValue.getValueType())) {
7961 ArgValue =
7962 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
7963 DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
7964 {ArgValue, Glue});
7965 Glue = ArgValue.getValue(1);
7966 }
7967 } else
7968 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
7969
7970 // If this is an 8, 16 or 32-bit value, it is really passed promoted
7971 // to 64 bits. Insert an assert[sz]ext to capture this, then
7972 // truncate to the right size.
7973 switch (VA.getLocInfo()) {
7974 default:
7975 llvm_unreachable("Unknown loc info!");
7976 case CCValAssign::Full:
7977 break;
7978 case CCValAssign::Indirect:
7979 assert(
7980 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
7981 "Indirect arguments should be scalable on most subtargets");
7982 break;
7983 case CCValAssign::BCvt:
7984 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
7985 break;
7986 case CCValAssign::AExt:
7987 case CCValAssign::SExt:
7988 case CCValAssign::ZExt:
7989 break;
7991 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
7992 DAG.getConstant(32, DL, RegVT));
7993 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
7994 break;
7995 }
7996 } else { // VA.isRegLoc()
7997 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
7998 unsigned ArgOffset = VA.getLocMemOffset();
7999 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
8000 ? VA.getLocVT().getSizeInBits()
8001 : VA.getValVT().getSizeInBits()) / 8;
8002
8003 uint32_t BEAlign = 0;
8004 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
8005 !Ins[i].Flags.isInConsecutiveRegs())
8006 BEAlign = 8 - ArgSize;
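// Example (illustrative): on big-endian, a 2-byte argument in an 8-byte slot gets
// BEAlign = 6, so it is loaded from the high end of the slot where the PCS placed it.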
8007
8008 SDValue FIN;
8009 MachinePointerInfo PtrInfo;
8010 if (StackViaX4) {
8011 // In both the ARM64EC varargs convention and the thunk convention,
8012 // arguments on the stack are accessed relative to x4, not sp. In
8013 // the thunk convention, there's an additional offset of 32 bytes
8014 // to account for the shadow store.
8015 unsigned ObjOffset = ArgOffset + BEAlign;
8016 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8017 ObjOffset += 32;
8018 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8019 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8020 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
8021 DAG.getConstant(ObjOffset, DL, MVT::i64));
8023 } else {
8024 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
8025
8026 // Create load nodes to retrieve arguments from the stack.
8027 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
8028 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
8029 }
8030
8031 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
8032 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
8033 MVT MemVT = VA.getValVT();
8034
8035 switch (VA.getLocInfo()) {
8036 default:
8037 break;
8038 case CCValAssign::Trunc:
8039 case CCValAssign::BCvt:
8040 MemVT = VA.getLocVT();
8041 break;
8042 case CCValAssign::Indirect:
8043 assert((VA.getValVT().isScalableVT() ||
8044 Subtarget->isWindowsArm64EC()) &&
8045 "Indirect arguments should be scalable on most subtargets");
8046 MemVT = VA.getLocVT();
8047 break;
8048 case CCValAssign::SExt:
8049 ExtType = ISD::SEXTLOAD;
8050 break;
8051 case CCValAssign::ZExt:
8052 ExtType = ISD::ZEXTLOAD;
8053 break;
8054 case CCValAssign::AExt:
8055 ExtType = ISD::EXTLOAD;
8056 break;
8057 }
8058
8059 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
8060 MemVT);
8061 }
8062
8063 if (VA.getLocInfo() == CCValAssign::Indirect) {
8064 assert((VA.getValVT().isScalableVT() ||
8065 Subtarget->isWindowsArm64EC()) &&
8066 "Indirect arguments should be scalable on most subtargets");
8067
8068 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
8069 unsigned NumParts = 1;
8070 if (Ins[i].Flags.isInConsecutiveRegs()) {
8071 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8072 ++NumParts;
8073 }
8074
8075 MVT PartLoad = VA.getValVT();
8076 SDValue Ptr = ArgValue;
8077
8078 // Ensure we generate all loads for each tuple part, whilst updating the
8079 // pointer after each load correctly using vscale.
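// Sketch (illustrative): an indirectly-passed svint32x2_t becomes two nxv4i32 loads,
// with Ptr advanced by (vscale x 16) bytes between them so each tuple part is read
// from its own scalable-sized slot.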
8080 while (NumParts > 0) {
8081 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
8082 InVals.push_back(ArgValue);
8083 NumParts--;
8084 if (NumParts > 0) {
8085 SDValue BytesIncrement;
8086 if (PartLoad.isScalableVector()) {
8087 BytesIncrement = DAG.getVScale(
8088 DL, Ptr.getValueType(),
8089 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
8090 } else {
8091 BytesIncrement = DAG.getConstant(
8092 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
8093 Ptr.getValueType());
8094 }
8095 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8096 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
8097 ExtraArgLocs++;
8098 i++;
8099 }
8100 }
8101 } else {
8102 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
8103 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
8104 ArgValue, DAG.getValueType(MVT::i32));
8105
8106 // i1 arguments are zero-extended to i8 by the caller. Emit a
8107 // hint to reflect this.
8108 if (Ins[i].isOrigArg()) {
8109 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
8110 if (OrigArg->getType()->isIntegerTy(1)) {
8111 if (!Ins[i].Flags.isZExt()) {
8112 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
8113 ArgValue.getValueType(), ArgValue);
8114 }
8115 }
8116 }
8117
8118 InVals.push_back(ArgValue);
8119 }
8120 }
8121 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
8122
8123 // Insert the SMSTART if this is a locally streaming function and
8124 // make sure it is Glued to the last CopyFromReg value.
8125 if (IsLocallyStreaming) {
8126 SDValue PStateSM;
8127 if (Attrs.hasStreamingCompatibleInterface()) {
8128 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
8131 FuncInfo->setPStateSMReg(Reg);
8132 Chain = DAG.getCopyToReg(Chain, DL, Reg, PStateSM);
8133 Chain = DAG.getCopyToReg(Chain, DL, Reg, PStateSM);
8134 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8135 AArch64SME::IfCallerIsNonStreaming, PStateSM);
8136 } else
8137 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8138 AArch64SME::Always);
8139 // Ensure that the SMSTART happens after the CopyWithChain such that its
8140 // chain result is used.
8141 for (unsigned I=0; I<InVals.size(); ++I) {
8144 Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
8145 InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
8146 InVals[I].getValueType());
8147 }
8148 }
8149
8150 // varargs
8151 if (isVarArg) {
8152 if (!Subtarget->isTargetDarwin() || IsWin64) {
8153 // The AAPCS variadic function ABI is identical to the non-variadic
8154 // one. As a result there may be more arguments in registers and we should
8155 // save them for future reference.
8156 // Win64 variadic functions also pass arguments in registers, but all float
8157 // arguments are passed in integer registers.
8158 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
8159 }
8160
8161 // This will point to the next argument passed via stack.
8162 unsigned VarArgsOffset = CCInfo.getStackSize();
8163 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
8164 VarArgsOffset = alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
8165 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
8166 FuncInfo->setVarArgsStackIndex(
8167 MFI.CreateFixedObject(4, VarArgsOffset, true));
8168
8169 if (MFI.hasMustTailInVarArgFunc()) {
8170 SmallVector<MVT, 2> RegParmTypes;
8171 RegParmTypes.push_back(MVT::i64);
8172 RegParmTypes.push_back(MVT::f128);
8173 // Compute the set of forwarded registers. The rest are scratch.
8175 FuncInfo->getForwardedMustTailRegParms();
8176 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
8178
8179 // Conservatively forward X8, since it might be used for aggregate return.
8180 if (!CCInfo.isAllocated(AArch64::X8)) {
8181 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
8182 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
8183 }
8184 }
8185 }
8186
8187 // On Windows, InReg pointers must be returned, so record the pointer in a
8188 // virtual register at the start of the function so it can be returned in the
8189 // epilogue.
8190 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
8191 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
8192 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
8193 Ins[I].Flags.isInReg()) &&
8194 Ins[I].Flags.isSRet()) {
8195 assert(!FuncInfo->getSRetReturnReg());
8196
8197 MVT PtrTy = getPointerTy(DAG.getDataLayout());
8198 Register Reg =
8200 FuncInfo->setSRetReturnReg(Reg);
8201
8202 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
8203 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
8204 break;
8205 }
8206 }
8207 }
8208
8209 unsigned StackArgSize = CCInfo.getStackSize();
8210 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
8211 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
8212 // This is a non-standard ABI so by fiat I say we're allowed to make full
8213 // use of the stack area to be popped, which must be aligned to 16 bytes in
8214 // any case:
8215 StackArgSize = alignTo(StackArgSize, 16);
8216
8217 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
8218 // a multiple of 16.
8219 FuncInfo->setArgumentStackToRestore(StackArgSize);
8220
8221 // This realignment carries over to the available bytes below. Our own
8222 // callers will guarantee the space is free by giving an aligned value to
8223 // CALLSEQ_START.
8224 }
8225 // Even if we're not expected to free up the space, it's useful to know how
8226 // much is there while considering tail calls (because we can reuse it).
8227 FuncInfo->setBytesInStackArgArea(StackArgSize);
8228
8229 if (Subtarget->hasCustomCallingConv())
8231
8232 // Create a 16 Byte TPIDR2 object. The dynamic buffer
8233 // will be expanded and stored in the static object later using a pseudonode.
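// Sketch of the 16-byte TPIDR2 block created below (per the SME lazy-save ABI,
// informal): bytes 0-7 hold the za_save_buffer pointer, bytes 8-9 hold the 16-bit
// num_za_save_slices (stored via a truncating i16 store in LowerCall), and the
// remaining bytes are reserved.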
8234 if (SMEAttrs(MF.getFunction()).hasZAState()) {
8235 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
8236 TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false);
8237 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8238 DAG.getConstant(1, DL, MVT::i32));
8239
8240 SDValue Buffer;
8241 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8243 DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL});
8244 } else {
8245 SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
8246 Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL,
8247 DAG.getVTList(MVT::i64, MVT::Other),
8248 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
8249 MFI.CreateVariableSizedObject(Align(16), nullptr);
8250 }
8251 Chain = DAG.getNode(
8252 AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),
8253 {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0)});
8254 } else if (SMEAttrs(MF.getFunction()).hasAgnosticZAInterface()) {
8255 // Call __arm_sme_state_size().
8256 SDValue BufferSize =
8258 DAG.getVTList(MVT::i64, MVT::Other), Chain);
8259 Chain = BufferSize.getValue(1);
8260
8261 SDValue Buffer;
8262 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8263 Buffer =
8265 DAG.getVTList(MVT::i64, MVT::Other), {Chain, BufferSize});
8266 } else {
8267 // Allocate space dynamically.
8268 Buffer = DAG.getNode(
8269 ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
8270 {Chain, BufferSize, DAG.getConstant(1, DL, MVT::i64)});
8271 MFI.CreateVariableSizedObject(Align(16), nullptr);
8272 }
8273
8274 // Copy the value to a virtual register, and save that in FuncInfo.
8275 Register BufferPtr =
8276 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8277 FuncInfo->setSMESaveBufferAddr(BufferPtr);
8278 Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
8279 }
8280
8281 if (CallConv == CallingConv::PreserveNone) {
8282 for (const ISD::InputArg &I : Ins) {
8283 if (I.Flags.isSwiftSelf() || I.Flags.isSwiftError() ||
8284 I.Flags.isSwiftAsync()) {
8287 MF.getFunction(),
8288 "Swift attributes can't be used with preserve_none",
8289 DL.getDebugLoc()));
8290 break;
8291 }
8292 }
8293 }
8294
8295 return Chain;
8296}
8297
8298void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
8299 SelectionDAG &DAG,
8300 const SDLoc &DL,
8301 SDValue &Chain) const {
8303 MachineFrameInfo &MFI = MF.getFrameInfo();
8305 auto PtrVT = getPointerTy(DAG.getDataLayout());
8306 Function &F = MF.getFunction();
8307 bool IsWin64 =
8308 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
8309
8311
8313 unsigned NumGPRArgRegs = GPRArgRegs.size();
8314 if (Subtarget->isWindowsArm64EC()) {
8315 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
8316 // functions.
8317 NumGPRArgRegs = 4;
8318 }
8319 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
8320
8321 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
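// Example (illustrative): for a variadic callee such as f(int, ...), x0 carries the
// fixed argument, FirstVariadicGPR == 1, and GPRSaveSize == 56, so x1-x7 are spilled
// into the register save area.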
8322 int GPRIdx = 0;
8323 if (GPRSaveSize != 0) {
8324 if (IsWin64) {
8325 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
8326 if (GPRSaveSize & 15)
8327 // The extra size here, if triggered, will always be 8.
8328 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
8329 } else
8330 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
8331
8332 SDValue FIN;
8333 if (Subtarget->isWindowsArm64EC()) {
8334 // With the Arm64EC ABI, we reserve the save area as usual, but we
8335 // compute its address relative to x4. For a normal AArch64->AArch64
8336 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
8337 // different address.
8338 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8339 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8340 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
8341 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
8342 } else {
8343 FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
8344 }
8345
8346 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
8347 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
8348 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8349 SDValue Store =
8350 DAG.getStore(Val.getValue(1), DL, Val, FIN,
8352 MF, GPRIdx, (i - FirstVariadicGPR) * 8)
8353 : MachinePointerInfo::getStack(MF, i * 8));
8354 MemOps.push_back(Store);
8355 FIN =
8356 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
8357 }
8358 }
8359 FuncInfo->setVarArgsGPRIndex(GPRIdx);
8360 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
8361
8362 if (Subtarget->hasFPARMv8() && !IsWin64) {
8363 auto FPRArgRegs = AArch64::getFPRArgRegs();
8364 const unsigned NumFPRArgRegs = FPRArgRegs.size();
8365 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
8366
8367 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
8368 int FPRIdx = 0;
8369 if (FPRSaveSize != 0) {
8370 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
8371
8372 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
8373
8374 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
8375 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
8376 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
8377
8378 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
8379 MachinePointerInfo::getStack(MF, i * 16));
8380 MemOps.push_back(Store);
8381 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
8382 DAG.getConstant(16, DL, PtrVT));
8383 }
8384 }
8385 FuncInfo->setVarArgsFPRIndex(FPRIdx);
8386 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
8387 }
8388
8389 if (!MemOps.empty()) {
8390 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
8391 }
8392}
8393
8394/// LowerCallResult - Lower the result values of a call into the
8395/// appropriate copies out of appropriate physical registers.
8396SDValue AArch64TargetLowering::LowerCallResult(
8397 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
8398 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
8399 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
8400 SDValue ThisVal, bool RequiresSMChange) const {
8401 DenseMap<unsigned, SDValue> CopiedRegs;
8402 // Copy all of the result registers out of their specified physreg.
8403 for (unsigned i = 0; i != RVLocs.size(); ++i) {
8404 CCValAssign VA = RVLocs[i];
8405
8406 // Pass 'this' value directly from the argument to return value, to avoid
8407 // reg unit interference
8408 if (i == 0 && isThisReturn) {
8409 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
8410 "unexpected return calling convention register assignment");
8411 InVals.push_back(ThisVal);
8412 continue;
8413 }
8414
8415 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
8416 // allows one use of a physreg per block.
8417 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
8418 if (!Val) {
8419 Val =
8420 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
8421 Chain = Val.getValue(1);
8422 InGlue = Val.getValue(2);
8423 CopiedRegs[VA.getLocReg()] = Val;
8424 }
8425
8426 switch (VA.getLocInfo()) {
8427 default:
8428 llvm_unreachable("Unknown loc info!");
8429 case CCValAssign::Full:
8430 break;
8431 case CCValAssign::BCvt:
8432 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
8433 break;
8435 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
8436 DAG.getConstant(32, DL, VA.getLocVT()));
8437 [[fallthrough]];
8438 case CCValAssign::AExt:
8439 [[fallthrough]];
8440 case CCValAssign::ZExt:
8441 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
8442 break;
8443 }
8444
8445 if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
8447 Val);
8448
8449 InVals.push_back(Val);
8450 }
8451
8452 return Chain;
8453}
8454
8455/// Return true if the calling convention is one that we can guarantee TCO for.
8456static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
8457 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
8458 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
8459}
8460
8461/// Return true if we might ever do TCO for calls with this calling convention.
8462static bool mayTailCallThisCC(CallingConv::ID CC) {
8463 switch (CC) {
8464 case CallingConv::C:
8469 case CallingConv::Swift:
8471 case CallingConv::Tail:
8472 case CallingConv::Fast:
8473 return true;
8474 default:
8475 return false;
8476 }
8477}
8478
8479/// Return true if the call convention supports varargs
8480/// Currently only those that pass varargs like the C
8481/// calling convention does are eligible
8482/// Calling conventions listed in this function must also
8483/// be properly handled in AArch64Subtarget::isCallingConvWin64
8484static bool callConvSupportsVarArgs(CallingConv::ID CC) {
8485 switch (CC) {
8486 case CallingConv::C:
8488 return true;
8489 default:
8490 return false;
8491 }
8492}
8493
8494static void analyzeCallOperands(const AArch64TargetLowering &TLI,
8495 const AArch64Subtarget *Subtarget,
8496 const TargetLowering::CallLoweringInfo &CLI,
8497 CCState &CCInfo) {
8498 const SelectionDAG &DAG = CLI.DAG;
8499 CallingConv::ID CalleeCC = CLI.CallConv;
8500 bool IsVarArg = CLI.IsVarArg;
8501 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8502 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC, IsVarArg);
8503
8504 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
8505 // for the shadow store.
8506 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
8507 CCInfo.AllocateStack(32, Align(16));
8508
8509 unsigned NumArgs = Outs.size();
8510 for (unsigned i = 0; i != NumArgs; ++i) {
8511 MVT ArgVT = Outs[i].VT;
8512 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
8513
8514 bool UseVarArgCC = false;
8515 if (IsVarArg) {
8516 // On Windows, the fixed arguments in a vararg call are passed in GPRs
8517 // too, so use the vararg CC to force them to integer registers.
8518 if (IsCalleeWin64) {
8519 UseVarArgCC = true;
8520 } else {
8521 UseVarArgCC = !Outs[i].IsFixed;
8522 }
8523 }
8524
8525 if (!UseVarArgCC) {
8526 // Get type of the original argument.
8527 EVT ActualVT =
8528 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
8529 /*AllowUnknown*/ true);
8530 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
8531 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
8532 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
8533 ArgVT = MVT::i8;
8534 else if (ActualMVT == MVT::i16)
8535 ArgVT = MVT::i16;
8536 }
8537
8538 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
8539 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
8540 assert(!Res && "Call operand has unhandled type");
8541 (void)Res;
8542 }
8543}
8544
8545bool AArch64TargetLowering::isEligibleForTailCallOptimization(
8546 const CallLoweringInfo &CLI) const {
8547 CallingConv::ID CalleeCC = CLI.CallConv;
8548 if (!mayTailCallThisCC(CalleeCC))
8549 return false;
8550
8551 SDValue Callee = CLI.Callee;
8552 bool IsVarArg = CLI.IsVarArg;
8553 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8554 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
8555 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
8556 const SelectionDAG &DAG = CLI.DAG;
8558 const Function &CallerF = MF.getFunction();
8559 CallingConv::ID CallerCC = CallerF.getCallingConv();
8560
8561 // SME Streaming functions are not eligible for TCO as they may require
8562 // the streaming mode or ZA to be restored after returning from the call.
8563 SMEAttrs CallerAttrs(MF.getFunction());
8564 auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal);
8565 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
8566 CallerAttrs.requiresLazySave(CalleeAttrs) ||
8567 CallerAttrs.requiresPreservingAllZAState(CalleeAttrs) ||
8568 CallerAttrs.hasStreamingBody())
8569 return false;
8570
8571 // Functions using the C or Fast calling convention that have an SVE signature
8572 // preserve more registers and should assume the SVE_VectorCall CC.
8573 // The check for matching callee-saved regs will determine whether it is
8574 // eligible for TCO.
8575 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
8576 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
8577 CallerCC = CallingConv::AArch64_SVE_VectorCall;
8578
8579 bool CCMatch = CallerCC == CalleeCC;
8580
8581 // When using the Windows calling convention on a non-windows OS, we want
8582 // to back up and restore X18 in such functions; we can't do a tail call
8583 // from those functions.
8584 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
8585 CalleeCC != CallingConv::Win64)
8586 return false;
8587
8588 // Byval parameters hand the function a pointer directly into the stack area
8589 // we want to reuse during a tail call. Working around this *is* possible (see
8590 // X86) but less efficient and uglier in LowerCall.
8591 for (Function::const_arg_iterator i = CallerF.arg_begin(),
8592 e = CallerF.arg_end();
8593 i != e; ++i) {
8594 if (i->hasByValAttr())
8595 return false;
8596
8597 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
8598 // In this case, it is necessary to save/restore X0 in the callee. Tail
8599 // call opt interferes with this. So we disable tail call opt when the
8600 // caller has an argument with "inreg" attribute.
8601
8602 // FIXME: Check whether the callee also has an "inreg" argument.
8603 if (i->hasInRegAttr())
8604 return false;
8605 }
8606
8607 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
8608 return CCMatch;
8609
8610 // Externally-defined functions with weak linkage should not be
8611 // tail-called on AArch64 when the OS does not support dynamic
8612 // pre-emption of symbols, as the AAELF spec requires normal calls
8613 // to undefined weak functions to be replaced with a NOP or jump to the
8614 // next instruction. The behaviour of branch instructions in this
8615 // situation (as used for tail calls) is implementation-defined, so we
8616 // cannot rely on the linker replacing the tail call with a return.
8617 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
8618 const GlobalValue *GV = G->getGlobal();
8619 const Triple &TT = getTargetMachine().getTargetTriple();
8620 if (GV->hasExternalWeakLinkage() &&
8621 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
8622 return false;
8623 }
8624
8625 // Now we search for cases where we can use a tail call without changing the
8626 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
8627 // concept.
8628
8629 // I want anyone implementing a new calling convention to think long and hard
8630 // about this assert.
8631 if (IsVarArg && !callConvSupportsVarArgs(CalleeCC))
8632 report_fatal_error("Unsupported variadic calling convention");
8633
8634 LLVMContext &C = *DAG.getContext();
8635 // Check that the call results are passed in the same way.
8636 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
8637 CCAssignFnForCall(CalleeCC, IsVarArg),
8638 CCAssignFnForCall(CallerCC, IsVarArg)))
8639 return false;
8640 // The callee has to preserve all registers the caller needs to preserve.
8641 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8642 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
8643 if (!CCMatch) {
8644 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
8645 if (Subtarget->hasCustomCallingConv()) {
8646 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
8647 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
8648 }
8649 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
8650 return false;
8651 }
8652
8653 // Nothing more to check if the callee is taking no arguments
8654 if (Outs.empty())
8655 return true;
8656
8657 SmallVector<CCValAssign, 16> ArgLocs;
8658 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
8659
8660 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
8661
8662 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
8663 // When we are musttail, additional checks have been done and we can safely ignore this check
8664 // At least two cases here: if caller is fastcc then we can't have any
8665 // memory arguments (we'd be expected to clean up the stack afterwards). If
8666 // caller is C then we could potentially use its argument area.
8667
8668 // FIXME: for now we take the most conservative of these in both cases:
8669 // disallow all variadic memory operands.
8670 for (const CCValAssign &ArgLoc : ArgLocs)
8671 if (!ArgLoc.isRegLoc())
8672 return false;
8673 }
8674
8675 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8676
8677 // If any of the arguments is passed indirectly, it must be SVE, so the
8678 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
8679 // allocate space on the stack. That is why we explicitly determine here that
8680 // such a call cannot be a tail call.
8681 if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
8682 assert((A.getLocInfo() != CCValAssign::Indirect ||
8683 A.getValVT().isScalableVector() ||
8684 Subtarget->isWindowsArm64EC()) &&
8685 "Expected value to be scalable");
8686 return A.getLocInfo() == CCValAssign::Indirect;
8687 }))
8688 return false;
8689
8690 // If the stack arguments for this call do not fit into our own save area then
8691 // the call cannot be made tail.
8692 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
8693 return false;
8694
8695 const MachineRegisterInfo &MRI = MF.getRegInfo();
8696 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
8697 return false;
8698
8699 return true;
8700}
8701
8702SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
8703 SelectionDAG &DAG,
8704 MachineFrameInfo &MFI,
8705 int ClobberedFI) const {
8706 SmallVector<SDValue, 8> ArgChains;
8707 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
8708 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
8709
8710 // Include the original chain at the beginning of the list. When this is
8711 // used by target LowerCall hooks, this helps legalize find the
8712 // CALLSEQ_BEGIN node.
8713 ArgChains.push_back(Chain);
8714
8715 // Add a chain value for each stack argument corresponding
8716 for (SDNode *U : DAG.getEntryNode().getNode()->users())
8717 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
8718 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
8719 if (FI->getIndex() < 0) {
8720 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
8721 int64_t InLastByte = InFirstByte;
8722 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
8723
8724 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
8725 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
8726 ArgChains.push_back(SDValue(L, 1));
8727 }
8728
8729 // Build a tokenfactor for all the chains.
8730 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
8731}
8732
8733bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
8734 bool TailCallOpt) const {
8735 return (CallCC == CallingConv::Fast && TailCallOpt) ||
8736 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
8737}
8738
8739// Check if the value is zero-extended from i1 to i8
8740static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
8741 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
8742 if (SizeInBits < 8)
8743 return false;
8744
8745 APInt RequiredZero(SizeInBits, 0xFE);
8746 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
8747 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
8748 return ZExtBool;
8749}
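// Example (illustrative): if Arg is (and x, 1) of type i8, computeKnownBits reports
// bits [7:1] as zero, the 0xFE mask is covered, and no extra zero-extension is
// emitted for an AAPCS i1 argument; an arbitrary i8 load fails the check and is
// re-extended by the caller-side lowering.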
8750
8751// The FORM_TRANSPOSED_REG_TUPLE pseudo should only be used if the
8752// input operands are copy nodes where the source register is in a
8753// StridedOrContiguous class. For example:
8754//
8755// %3:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO ..
8756// %4:zpr = COPY %3.zsub1:zpr2stridedorcontiguous
8757// %5:zpr = COPY %3.zsub0:zpr2stridedorcontiguous
8758// %6:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO ..
8759// %7:zpr = COPY %6.zsub1:zpr2stridedorcontiguous
8760// %8:zpr = COPY %6.zsub0:zpr2stridedorcontiguous
8761// %9:zpr2mul2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %5:zpr, %8:zpr
8762//
8763static bool shouldUseFormStridedPseudo(MachineInstr &MI) {
8764 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
8765
8766 const TargetRegisterClass *RegClass = nullptr;
8767 switch (MI.getOpcode()) {
8768 case AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO:
8769 RegClass = &AArch64::ZPR2StridedOrContiguousRegClass;
8770 break;
8771 case AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO:
8772 RegClass = &AArch64::ZPR4StridedOrContiguousRegClass;
8773 break;
8774 default:
8775 llvm_unreachable("Unexpected opcode.");
8776 }
8777
8779 for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
8780 MachineOperand &MO = MI.getOperand(I);
8781 assert(MO.isReg() && "Unexpected operand to FORM_TRANSPOSED_REG_TUPLE");
8782
8783 MachineOperand *Def = MRI.getOneDef(MO.getReg());
8784 if (!Def || !Def->getParent()->isCopy())
8785 return false;
8786
8787 const MachineOperand &CopySrc = Def->getParent()->getOperand(1);
8788 unsigned OpSubReg = CopySrc.getSubReg();
8790 SubReg = OpSubReg;
8791
8792 MachineOperand *CopySrcOp = MRI.getOneDef(CopySrc.getReg());
8793 if (!CopySrcOp || !CopySrcOp->isReg() || OpSubReg != SubReg ||
8794 MRI.getRegClass(CopySrcOp->getReg()) != RegClass)
8795 return false;
8796 }
8797
8798 return true;
8799}
8800
8801void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
8802 SDNode *Node) const {
8803 // Live-in physreg copies that are glued to SMSTART are applied as
8804 // implicit-def's in the InstrEmitter. Here we remove them, allowing the
8805 // register allocator to pass call args in callee saved regs, without extra
8806 // copies to avoid these fake clobbers of actually-preserved GPRs.
8807 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
8808 MI.getOpcode() == AArch64::MSRpstatePseudo) {
8809 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
8810 if (MachineOperand &MO = MI.getOperand(I);
8811 MO.isReg() && MO.isImplicit() && MO.isDef() &&
8812 (AArch64::GPR32RegClass.contains(MO.getReg()) ||
8813 AArch64::GPR64RegClass.contains(MO.getReg())))
8814 MI.removeOperand(I);
8815
8816 // The SVE vector length can change when entering/leaving streaming mode.
8817 if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
8818 MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
8819 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
8820 /*IsImplicit=*/true));
8821 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
8822 /*IsImplicit=*/true));
8823 }
8824 }
8825
8826 if (MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
8827 MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) {
8828 // If input values to the FORM_TRANSPOSED_REG_TUPLE pseudo aren't copies
8829 // from a StridedOrContiguous class, fall back on REG_SEQUENCE node.
8830 if (!shouldUseFormStridedPseudo(MI))
8831 return;
8832
8833 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
8834 MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
8835 TII->get(TargetOpcode::REG_SEQUENCE),
8836 MI.getOperand(0).getReg());
8837
8838 for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
8839 MIB.add(MI.getOperand(I));
8840 MIB.addImm(AArch64::zsub0 + (I - 1));
8841 }
8842
8843 MI.eraseFromParent();
8844 return;
8845 }
8846
8847 // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
8848 // have nothing to do with VG, were it not that they are used to materialise a
8849 // frame-address. If they contain a frame-index to a scalable vector, this
8850 // will likely require an ADDVL instruction to materialise the address, thus
8851 // reading VG.
8852 const MachineFunction &MF = *MI.getMF();
8853 if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
8854 (MI.getOpcode() == AArch64::ADDXri ||
8855 MI.getOpcode() == AArch64::SUBXri)) {
8856 const MachineOperand &MO = MI.getOperand(1);
8857 if (MO.isFI() && MF.getFrameInfo().getStackID(MO.getIndex()) ==
8858 TargetStackID::ScalableVector)
8859 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
8860 /*IsImplicit=*/true));
8861 }
8862}
8863
8864SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
8865 bool Enable, SDValue Chain,
8866 SDValue InGlue,
8867 unsigned Condition,
8868 SDValue PStateSM) const {
8869 MachineFunction &MF = DAG.getMachineFunction();
8870 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8871 FuncInfo->setHasStreamingModeChanges(true);
8872
8873 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8874 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
8875 SDValue MSROp =
8876 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
8877 SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
8878 SmallVector<SDValue> Ops = {Chain, MSROp, ConditionOp};
8879 if (Condition != AArch64SME::Always) {
8880 assert(PStateSM && "PStateSM should be defined");
8881 Ops.push_back(PStateSM);
8882 }
8883 Ops.push_back(RegMask);
8884
8885 if (InGlue)
8886 Ops.push_back(InGlue);
8887
8888 unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
8889 return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
8890}
8891
8892// Emit a call to __arm_sme_save or __arm_sme_restore.
8894 SelectionDAG &DAG,
8896 SDValue Chain, bool IsSave) {
8899 FuncInfo->setSMESaveBufferUsed();
8900
8903 Entry.Ty = PointerType::getUnqual(*DAG.getContext());
8904 Entry.Node =
8905 DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64);
8906 Args.push_back(Entry);
8907
8908 SDValue Callee =
8909 DAG.getExternalSymbol(IsSave ? "__arm_sme_save" : "__arm_sme_restore",
8910 TLI.getPointerTy(DAG.getDataLayout()));
8911 auto *RetTy = Type::getVoidTy(*DAG.getContext());
8913 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
8915 Callee, std::move(Args));
8916 return TLI.LowerCallTo(CLI).second;
8917}
8918
8919static unsigned getSMCondition(const SMEAttrs &CallerAttrs,
8920 const SMEAttrs &CalleeAttrs) {
8921 if (!CallerAttrs.hasStreamingCompatibleInterface() ||
8922 CallerAttrs.hasStreamingBody())
8923 return AArch64SME::Always;
8924 if (CalleeAttrs.hasNonStreamingInterface())
8925 return AArch64SME::IfCallerIsStreaming;
8926 if (CalleeAttrs.hasStreamingInterface())
8927 return AArch64SME::IfCallerIsNonStreaming;
8928
8929 llvm_unreachable("Unsupported attributes");
8930}
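// Example (illustrative): a streaming-compatible caller invoking a non-streaming
// callee yields IfCallerIsStreaming, so the surrounding SMSTOP/SMSTART pair is only
// executed when PSTATE.SM was actually 1 at the call site.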
8931
8932/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
8933/// and add input and output parameter nodes.
8934SDValue
8935AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
8936 SmallVectorImpl<SDValue> &InVals) const {
8937 SelectionDAG &DAG = CLI.DAG;
8938 SDLoc &DL = CLI.DL;
8939 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8940 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
8942 SDValue Chain = CLI.Chain;
8943 SDValue Callee = CLI.Callee;
8944 bool &IsTailCall = CLI.IsTailCall;
8945 CallingConv::ID &CallConv = CLI.CallConv;
8946 bool IsVarArg = CLI.IsVarArg;
8947
8950 bool IsThisReturn = false;
8951
8953 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
8954 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
8955 bool IsSibCall = false;
8956 bool GuardWithBTI = false;
8957
8958 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
8959 !Subtarget->noBTIAtReturnTwice()) {
8960 GuardWithBTI = FuncInfo->branchTargetEnforcement();
8961 }
8962
8963 // Analyze operands of the call, assigning locations to each operand.
8964 SmallVector<CCValAssign, 16> ArgLocs;
8965 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
8966
8967 if (IsVarArg) {
8968 unsigned NumArgs = Outs.size();
8969
8970 for (unsigned i = 0; i != NumArgs; ++i) {
8971 if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
8972 report_fatal_error("Passing SVE types to variadic functions is "
8973 "currently not supported");
8974 }
8975 }
8976
8977 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
8978
8979 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
8980 // Assign locations to each value returned by this call.
8981 SmallVector<CCValAssign, 16> RVLocs;
8982 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
8983 *DAG.getContext());
8984 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
8985
8986 // Check callee args/returns for SVE registers and set calling convention
8987 // accordingly.
8988 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
8989 auto HasSVERegLoc = [](CCValAssign &Loc) {
8990 if (!Loc.isRegLoc())
8991 return false;
8992 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
8993 AArch64::PPRRegClass.contains(Loc.getLocReg());
8994 };
8995 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
8997 }
8998
8999 if (IsTailCall) {
9000 // Check if it's really possible to do a tail call.
9001 IsTailCall = isEligibleForTailCallOptimization(CLI);
9002
9003 // A sibling call is one where we're under the usual C ABI and not planning
9004 // to change that but can still do a tail call:
9005 if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
9006 CallConv != CallingConv::SwiftTail)
9007 IsSibCall = true;
9008
9009 if (IsTailCall)
9010 ++NumTailCalls;
9011 }
9012
9013 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
9014 report_fatal_error("failed to perform tail call elimination on a call "
9015 "site marked musttail");
9016
9017 // Get a count of how many bytes are to be pushed on the stack.
9018 unsigned NumBytes = CCInfo.getStackSize();
9019
9020 if (IsSibCall) {
9021 // Since we're not changing the ABI to make this a tail call, the memory
9022 // operands are already available in the caller's incoming argument space.
9023 NumBytes = 0;
9024 }
9025
9026 // FPDiff is the byte offset of the call's argument area from the callee's.
9027 // Stores to callee stack arguments will be placed in FixedStackSlots offset
9028 // by this amount for a tail call. In a sibling call it must be 0 because the
9029 // caller will deallocate the entire stack and the callee still expects its
9030 // arguments to begin at SP+0. Completely unused for non-tail calls.
9031 int FPDiff = 0;
9032
9033 if (IsTailCall && !IsSibCall) {
9034 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
9035
9036 // Since callee will pop argument stack as a tail call, we must keep the
9037 // popped size 16-byte aligned.
9038 NumBytes = alignTo(NumBytes, 16);
9039
9040 // FPDiff will be negative if this tail call requires more space than we
9041 // would automatically have in our incoming argument space. Positive if we
9042 // can actually shrink the stack.
9043 FPDiff = NumReusableBytes - NumBytes;
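// Example (illustrative): if the caller's incoming argument area holds 16 bytes but
// this tail call needs 32 bytes of stack arguments, FPDiff == -16 and 16 extra bytes
// are reserved below via setTailCallReservedStack.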
9044
9045 // Update the required reserved area if this is the tail call requiring the
9046 // most argument stack space.
9047 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
9048 FuncInfo->setTailCallReservedStack(-FPDiff);
9049
9050 // The stack pointer must be 16-byte aligned at all times it's used for a
9051 // memory operation, which in practice means at *all* times and in
9052 // particular across call boundaries. Therefore our own arguments started at
9053 // a 16-byte aligned SP and the delta applied for the tail call should
9054 // satisfy the same constraint.
9055 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
9056 }
9057
9058 // Determine whether we need any streaming mode changes.
9059 SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction());
9060 if (CLI.CB)
9061 CalleeAttrs = SMEAttrs(*CLI.CB);
9062 else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9063 CalleeAttrs = SMEAttrs(ES->getSymbol());
9064
9065 auto DescribeCallsite =
9067 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
9068 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9069 R << ore::NV("Callee", ES->getSymbol());
9070 else if (CLI.CB && CLI.CB->getCalledFunction())
9071 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
9072 else
9073 R << "unknown callee";
9074 R << "'";
9075 return R;
9076 };
9077
9078 bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
9079 bool RequiresSaveAllZA =
9080 CallerAttrs.requiresPreservingAllZAState(CalleeAttrs);
9081 if (RequiresLazySave) {
9082 const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
9083 MachinePointerInfo MPI =
9085 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
9086 TPIDR2.FrameIndex,
9088 SDValue NumZaSaveSlicesAddr =
9089 DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
9090 DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));
9091 SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
9092 DAG.getConstant(1, DL, MVT::i32));
9093 Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr,
9094 MPI, MVT::i16);
9095 Chain = DAG.getNode(
9096 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
9097 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
9098 TPIDR2ObjAddr);
9100 ORE.emit([&]() {
9101 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9102 CLI.CB)
9103 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9104 &MF.getFunction());
9105 return DescribeCallsite(R) << " sets up a lazy save for ZA";
9106 });
9107 } else if (RequiresSaveAllZA) {
9108 assert(!CalleeAttrs.hasSharedZAInterface() &&
9109 "Cannot share state that may not exist");
9110 Chain = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Chain,
9111 /*IsSave=*/true);
9112 }
9113
9114 SDValue PStateSM;
9115 bool RequiresSMChange = CallerAttrs.requiresSMChange(CalleeAttrs);
9116 if (RequiresSMChange) {
9117 if (CallerAttrs.hasStreamingInterfaceOrBody())
9118 PStateSM = DAG.getConstant(1, DL, MVT::i64);
9119 else if (CallerAttrs.hasNonStreamingInterface())
9120 PStateSM = DAG.getConstant(0, DL, MVT::i64);
9121 else
9122 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
9124 ORE.emit([&]() {
9125 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
9126 CLI.CB)
9127 : OptimizationRemarkAnalysis("sme", "SMETransition",
9128 &MF.getFunction());
9129 DescribeCallsite(R) << " requires a streaming mode transition";
9130 return R;
9131 });
9132 }
9133
9134 SDValue ZTFrameIdx;
9135 MachineFrameInfo &MFI = MF.getFrameInfo();
9136 bool ShouldPreserveZT0 = CallerAttrs.requiresPreservingZT0(CalleeAttrs);
9137
9138 // If the caller has ZT0 state which will not be preserved by the callee,
9139 // spill ZT0 before the call.
9140 if (ShouldPreserveZT0) {
9141 unsigned ZTObj = MFI.CreateSpillStackObject(64, Align(16));
9142 ZTFrameIdx = DAG.getFrameIndex(
9143 ZTObj,
9145
9146 Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
9147 {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
9148 }
9149
9150 // If caller shares ZT0 but the callee is not shared ZA, we need to stop
9151 // PSTATE.ZA before the call if there is no lazy-save active.
9152 bool DisableZA = CallerAttrs.requiresDisablingZABeforeCall(CalleeAttrs);
9153 assert((!DisableZA || !RequiresLazySave) &&
9154 "Lazy-save should have PSTATE.SM=1 on entry to the function");
9155
9156 if (DisableZA)
9157 Chain = DAG.getNode(
9158 AArch64ISD::SMSTOP, DL, MVT::Other, Chain,
9159 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
9160 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
9161
9162 // Adjust the stack pointer for the new arguments...
9163 // These operations are automatically eliminated by the prolog/epilog pass
9164 if (!IsSibCall)
9165 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
9166
9167 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
9169
9171 SmallSet<unsigned, 8> RegsUsed;
9172 SmallVector<SDValue, 8> MemOpChains;
9173 auto PtrVT = getPointerTy(DAG.getDataLayout());
9174
9175 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
9176 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
9177 for (const auto &F : Forwards) {
9178 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
9179 RegsToPass.emplace_back(F.PReg, Val);
9180 }
9181 }
9182
9183 // Walk the register/memloc assignments, inserting copies/loads.
9184 unsigned ExtraArgLocs = 0;
9185 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
9186 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
9187 SDValue Arg = OutVals[i];
9188 ISD::ArgFlagsTy Flags = Outs[i].Flags;
9189
9190 // Promote the value if needed.
9191 switch (VA.getLocInfo()) {
9192 default:
9193 llvm_unreachable("Unknown loc info!");
9194 case CCValAssign::Full:
9195 break;
9196 case CCValAssign::SExt:
9197 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
9198 break;
9199 case CCValAssign::ZExt:
9200 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
9201 break;
9202 case CCValAssign::AExt:
9203 if (Outs[i].ArgVT == MVT::i1) {
9204 // AAPCS requires i1 to be zero-extended to 8 bits by the caller.
9205 //
9206 // Check if we actually have to do this, because the value may
9207 // already be zero-extended.
9208 //
9209 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
9210 // and rely on DAGCombiner to fold this, because the following
9211 // (anyext i32) is combined with (zext i8) in DAG.getNode:
9212 //
9213 // (ext (zext x)) -> (zext x)
9214 //
9215 // This will give us (zext i32), which we cannot remove, so
9216 // try to check this beforehand.
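        //
        // For example, when the i1 value comes from a source whose upper bits
        // are not provably zero, checkZExtBool fails and we emit the explicit
        // trunc+zext pair so the argument register holds exactly 0 or 1, as
        // the AAPCS requires.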
9217 if (!checkZExtBool(Arg, DAG)) {
9218 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
9219 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
9220 }
9221 }
9222 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
9223 break;
9225 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
9226 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
9227 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
9228 DAG.getConstant(32, DL, VA.getLocVT()));
9229 break;
9230 case CCValAssign::BCvt:
9231 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
9232 break;
9233 case CCValAssign::Trunc:
9234 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
9235 break;
9236 case CCValAssign::FPExt:
9237 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
9238 break;
9240 bool isScalable = VA.getValVT().isScalableVT();
9241 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
9242 "Indirect arguments should be scalable on most subtargets");
9243
9244 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
9245 uint64_t PartSize = StoreSize;
9246 unsigned NumParts = 1;
9247 if (Outs[i].Flags.isInConsecutiveRegs()) {
9248 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
9249 ++NumParts;
9250 StoreSize *= NumParts;
9251 }
9252
9253 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
9254 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
9255 MachineFrameInfo &MFI = MF.getFrameInfo();
9256 int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
9257 if (isScalable)
9259
9263 SDValue SpillSlot = Ptr;
9264
9265 // Ensure we generate all stores for each tuple part, whilst updating the
9266 // pointer after each store correctly using vscale.
9267 while (NumParts) {
9268 SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
9269 MemOpChains.push_back(Store);
9270
9271 NumParts--;
9272 if (NumParts > 0) {
9273 SDValue BytesIncrement;
9274 if (isScalable) {
9275 BytesIncrement = DAG.getVScale(
9276 DL, Ptr.getValueType(),
9277 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
9278 } else {
9279 BytesIncrement = DAG.getConstant(
9280 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
9281 Ptr.getValueType());
9282 }
9283 MPI = MachinePointerInfo(MPI.getAddrSpace());
9284 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
9285 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
9286 ExtraArgLocs++;
9287 i++;
9288 }
9289 }
9290
9291 Arg = SpillSlot;
9292 break;
9293 }
9294
9295 if (VA.isRegLoc()) {
9296 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
9297 Outs[0].VT == MVT::i64) {
9298 assert(VA.getLocVT() == MVT::i64 &&
9299 "unexpected calling convention register assignment");
9300 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
9301 "unexpected use of 'returned'");
9302 IsThisReturn = true;
9303 }
9304 if (RegsUsed.count(VA.getLocReg())) {
9305 // If this register has already been used then we're trying to pack
9306 // parts of an [N x i32] into an X-register. The extension type will
9307 // take care of putting the two halves in the right place but we have to
9308 // combine them.
9309 SDValue &Bits =
9310 llvm::find_if(RegsToPass,
9311 [=](const std::pair<unsigned, SDValue> &Elt) {
9312 return Elt.first == VA.getLocReg();
9313 })
9314 ->second;
9315 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
9316 // Call site info is used for the function's parameter entry value
9317 // tracking. For now we track only the simple cases where the parameter
9318 // is transferred through a whole register.
9320 [&VA](MachineFunction::ArgRegPair ArgReg) {
9321 return ArgReg.Reg == VA.getLocReg();
9322 });
9323 } else {
9324 // Add an extra level of indirection for streaming mode changes by
9325 // using a pseudo copy node that cannot be rematerialised between a
9326 // smstart/smstop and the call by the simple register coalescer.
9327 if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
9328 Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
9329 Arg.getValueType(), Arg);
9330 RegsToPass.emplace_back(VA.getLocReg(), Arg);
9331 RegsUsed.insert(VA.getLocReg());
9332 const TargetOptions &Options = DAG.getTarget().Options;
9333 if (Options.EmitCallSiteInfo)
9334 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
9335 }
9336 } else {
9337 assert(VA.isMemLoc());
9338
9339 SDValue DstAddr;
9340 MachinePointerInfo DstInfo;
9341
9342 // FIXME: This works on big-endian for composite byvals, which are the
9343 // common case. It should also work for fundamental types.
9344 uint32_t BEAlign = 0;
9345 unsigned OpSize;
9346 if (VA.getLocInfo() == CCValAssign::Indirect ||
9348 OpSize = VA.getLocVT().getFixedSizeInBits();
9349 else
9350 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
9351 : VA.getValVT().getSizeInBits();
9352 OpSize = (OpSize + 7) / 8;
9353 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
9354 !Flags.isInConsecutiveRegs()) {
9355 if (OpSize < 8)
9356 BEAlign = 8 - OpSize;
9357 }
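        // For example, a 4-byte non-byval argument on a big-endian target gets
        // BEAlign = 4, so it is stored at the high-address end of its 8-byte
        // stack slot rather than at the slot base.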
9358 unsigned LocMemOffset = VA.getLocMemOffset();
9359 int32_t Offset = LocMemOffset + BEAlign;
9360 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
9361 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
9362
9363 if (IsTailCall) {
9364 Offset = Offset + FPDiff;
9365 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
9366
9367 DstAddr = DAG.getFrameIndex(FI, PtrVT);
9368 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
9369
9370 // Make sure any stack arguments overlapping with where we're storing
9371 // are loaded before this eventual operation. Otherwise they'll be
9372 // clobbered.
9373 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
9374 } else {
9375 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
9376
9377 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
9378 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
9379 }
9380
9381 if (Outs[i].Flags.isByVal()) {
9382 SDValue SizeNode =
9383 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
9384 SDValue Cpy = DAG.getMemcpy(
9385 Chain, DL, DstAddr, Arg, SizeNode,
9386 Outs[i].Flags.getNonZeroByValAlign(),
9387 /*isVol = */ false, /*AlwaysInline = */ false,
9388 /*CI=*/nullptr, std::nullopt, DstInfo, MachinePointerInfo());
9389
9390 MemOpChains.push_back(Cpy);
9391 } else {
9392 // Since we pass i1/i8/i16 as i1/i8/i16 on the stack and Arg has already
9393 // been promoted to the legal register type i32, we should truncate Arg
9394 // back to i1/i8/i16.
9395 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
9396 VA.getValVT() == MVT::i16)
9397 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
9398
9399 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
9400 MemOpChains.push_back(Store);
9401 }
9402 }
9403 }
9404
9405 if (IsVarArg && Subtarget->isWindowsArm64EC()) {
9406 SDValue ParamPtr = StackPtr;
9407 if (IsTailCall) {
9408 // Create a dummy object at the top of the stack that can be used to get
9409 // the SP after the epilogue
9410 int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
9411 ParamPtr = DAG.getFrameIndex(FI, PtrVT);
9412 }
9413
9414 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
9415 // describing the argument list. x4 contains the address of the
9416 // first stack parameter. x5 contains the size in bytes of all parameters
9417 // passed on the stack.
9418 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
9419 RegsToPass.emplace_back(AArch64::X5,
9420 DAG.getConstant(NumBytes, DL, MVT::i64));
9421 }
9422
9423 if (!MemOpChains.empty())
9424 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
9425
9426 SDValue InGlue;
9427 if (RequiresSMChange) {
9428 if (!Subtarget->isTargetDarwin() || Subtarget->hasSVE()) {
9429 Chain = DAG.getNode(AArch64ISD::VG_SAVE, DL,
9430 DAG.getVTList(MVT::Other, MVT::Glue), Chain);
9431 InGlue = Chain.getValue(1);
9432 }
9433
9434 SDValue NewChain = changeStreamingMode(
9435 DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain, InGlue,
9436 getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
9437 Chain = NewChain.getValue(0);
9438 InGlue = NewChain.getValue(1);
9439 }
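  // Rough shape of the call site when a streaming-mode change is required
  // (a sketch; the transitions are conditional on PStateSM for
  // streaming-compatible callers):
  //   VG_SAVE pseudo                 ; skipped on non-SVE Darwin, see above
  //   smstart/smstop sm              ; enter the callee's mode
  //   bl callee
  //   smstop/smstart sm              ; return to the caller's mode
  //   VG_RESTORE pseudo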
9440
9441 // Build a sequence of copy-to-reg nodes chained together with token chain
9442 // and flag operands which copy the outgoing args into the appropriate regs.
9443 for (auto &RegToPass : RegsToPass) {
9444 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
9445 RegToPass.second, InGlue);
9446 InGlue = Chain.getValue(1);
9447 }
9448
9449 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
9450 // direct call is), turn it into a TargetGlobalAddress/TargetExternalSymbol
9451 // node so that legalize doesn't hack it.
9452 const GlobalValue *CalledGlobal = nullptr;
9453 unsigned OpFlags = 0;
9454 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
9455 CalledGlobal = G->getGlobal();
9456 OpFlags = Subtarget->classifyGlobalFunctionReference(CalledGlobal,
9458 if (OpFlags & AArch64II::MO_GOT) {
9459 Callee = DAG.getTargetGlobalAddress(CalledGlobal, DL, PtrVT, 0, OpFlags);
9460 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
9461 } else {
9462 const GlobalValue *GV = G->getGlobal();
9463 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
9464 }
9465 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
9466 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
9467 Subtarget->isTargetMachO()) ||
9469 const char *Sym = S->getSymbol();
9470 if (UseGot) {
9472 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
9473 } else {
9474 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
9475 }
9476 }
9477
9478 // We don't usually want to end the call-sequence here because we would tidy
9479 // the frame up *after* the call. However, in the ABI-changing tail-call case
9480 // we've carefully laid out the parameters so that when sp is reset they'll be
9481 // in the correct location.
9482 if (IsTailCall && !IsSibCall) {
9483 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
9484 InGlue = Chain.getValue(1);
9485 }
9486
9487 unsigned Opc = IsTailCall ? AArch64ISD::TC_RETURN : AArch64ISD::CALL;
9488
9489 std::vector<SDValue> Ops;
9490 Ops.push_back(Chain);
9491 Ops.push_back(Callee);
9492
9493 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
9494 // be expanded to the call, directly followed by a special marker sequence and
9495 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
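  // For example, a call carrying
  //   "clang.arc.attachedcall"(ptr @objc_retainAutoreleasedReturnValue)
  // is lowered so that the runtime function's address is inserted as the
  // operand right after the callee, and the marker plus runtime call are
  // emitted when the CALL_RVMARKER node is expanded.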
9496 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
9497 assert(!IsTailCall &&
9498 "tail calls cannot be marked with clang.arc.attachedcall");
9500
9501 // Add a target global address for the retainRV/claimRV runtime function
9502 // just before the call target.
9503 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
9504 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
9505 Ops.insert(Ops.begin() + 1, GA);
9506 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
9508 } else if (GuardWithBTI) {
9510 }
9511
9512 if (IsTailCall) {
9513 // Each tail call may have to adjust the stack by a different amount, so
9514 // this information must travel along with the operation for eventual
9515 // consumption by emitEpilogue.
9516 Ops.push_back(DAG.getSignedTargetConstant(FPDiff, DL, MVT::i32));
9517 }
9518
9519 if (CLI.PAI) {
9520 const uint64_t Key = CLI.PAI->Key;
9521 assert((Key == AArch64PACKey::IA || Key == AArch64PACKey::IB) &&
9522 "Invalid auth call key");
9523
9524 // Split the discriminator into address/integer components.
9525 SDValue AddrDisc, IntDisc;
9526 std::tie(IntDisc, AddrDisc) =
9527 extractPtrauthBlendDiscriminators(CLI.PAI->Discriminator, &DAG);
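    // The discriminator may be a "blend" of an address part and a 16-bit
    // integer part; extractPtrauthBlendDiscriminators splits it so the two
    // can be passed as separate call operands (a constant-only discriminator
    // typically ends up entirely in IntDisc, a plain address in AddrDisc).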
9528
9529 if (Opc == AArch64ISD::CALL_RVMARKER)
9531 else
9533 Ops.push_back(DAG.getTargetConstant(Key, DL, MVT::i32));
9534 Ops.push_back(IntDisc);
9535 Ops.push_back(AddrDisc);
9536 }
9537
9538 // Add argument registers to the end of the list so that they are known live
9539 // into the call.
9540 for (auto &RegToPass : RegsToPass)
9541 Ops.push_back(DAG.getRegister(RegToPass.first,
9542 RegToPass.second.getValueType()));
9543
9544 // Add a register mask operand representing the call-preserved registers.
9545 const uint32_t *Mask;
9546 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9547 if (IsThisReturn) {
9548 // For 'this' returns, use the X0-preserving mask if applicable
9549 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
9550 if (!Mask) {
9551 IsThisReturn = false;
9552 Mask = TRI->getCallPreservedMask(MF, CallConv);
9553 }
9554 } else
9555 Mask = TRI->getCallPreservedMask(MF, CallConv);
9556
9557 if (Subtarget->hasCustomCallingConv())
9558 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
9559
9560 if (TRI->isAnyArgRegReserved(MF))
9561 TRI->emitReservedArgRegCallError(MF);
9562
9563 assert(Mask && "Missing call preserved mask for calling convention");
9564 Ops.push_back(DAG.getRegisterMask(Mask));
9565
9566 if (InGlue.getNode())
9567 Ops.push_back(InGlue);
9568
9569 // If we're doing a tail call, use a TC_RETURN here rather than an
9570 // actual call instruction.
9571 if (IsTailCall) {
9573 SDValue Ret = DAG.getNode(Opc, DL, MVT::Other, Ops);
9574 if (IsCFICall)
9575 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
9576
9577 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
9578 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
9579 if (CalledGlobal)
9580 DAG.addCalledGlobal(Ret.getNode(), CalledGlobal, OpFlags);
9581 return Ret;
9582 }
9583
9584 // Returns a chain and a flag for retval copy to use.
9585 Chain = DAG.getNode(Opc, DL, {MVT::Other, MVT::Glue}, Ops);
9586 if (IsCFICall)
9587 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
9588
9589 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
9590 InGlue = Chain.getValue(1);
9591 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
9592 if (CalledGlobal)
9593 DAG.addCalledGlobal(Chain.getNode(), CalledGlobal, OpFlags);
9594
9595 uint64_t CalleePopBytes =
9596 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
9597
9598 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
9599 InGlue = Chain.getValue(1);
9600
9601 // Handle result values, copying them out of physregs into vregs that we
9602 // return.
9603 SDValue Result = LowerCallResult(
9604 Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
9605 IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
9606
9607 if (!Ins.empty())
9608 InGlue = Result.getValue(Result->getNumValues() - 1);
9609
9610 if (RequiresSMChange) {
9611 assert(PStateSM && "Expected a PStateSM to be set");
9612 Result = changeStreamingMode(
9613 DAG, DL, !CalleeAttrs.hasStreamingInterface(), Result, InGlue,
9614 getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
9615
9616 if (!Subtarget->isTargetDarwin() || Subtarget->hasSVE()) {
9617 InGlue = Result.getValue(1);
9618 Result =
9619 DAG.getNode(AArch64ISD::VG_RESTORE, DL,
9620 DAG.getVTList(MVT::Other, MVT::Glue), {Result, InGlue});
9621 }
9622 }
9623
9624 if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs))
9625 // Unconditionally resume ZA.
9626 Result = DAG.getNode(
9627 AArch64ISD::SMSTART, DL, MVT::Other, Result,
9628 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
9629 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
9630
9631 if (ShouldPreserveZT0)
9632 Result =
9633 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
9634 {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
9635
9636 if (RequiresLazySave) {
9637 // Conditionally restore the lazy save using a pseudo node.
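    // Rough shape of the conditional restore built below (registers are
    // illustrative): per the SME lazy-save scheme, ZA only needs restoring if
    // an intervening callee committed the save and cleared TPIDR2_EL0:
    //   mrs  x8, TPIDR2_EL0
    //   add  x0, <frame>, #<TPIDR2 block offset>
    //   cbnz x8, 1f                 ; still armed: ZA was not clobbered
    //   bl   __arm_tpidr2_restore   ; x0 = address of the TPIDR2 block
    // 1:
    //   msr  TPIDR2_EL0, xzr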
9638 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
9639 SDValue RegMask = DAG.getRegisterMask(
9640 TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
9641 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
9642 "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout()));
9643 SDValue TPIDR2_EL0 = DAG.getNode(
9644 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
9645 DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
9646
9647 // Copy the address of the TPIDR2 block into X0 before 'calling' the
9648 // RESTORE_ZA pseudo.
9649 SDValue Glue;
9650 SDValue TPIDR2Block = DAG.getFrameIndex(
9651 TPIDR2.FrameIndex,
9653 Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
9654 Result =
9655 DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
9656 {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
9657 RestoreRoutine, RegMask, Result.getValue(1)});
9658
9659 // Finally reset the TPIDR2_EL0 register to 0.
9660 Result = DAG.getNode(
9661 ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
9662 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
9663 DAG.getConstant(0, DL, MVT::i64));
9664 TPIDR2.Uses++;
9665 } else if (RequiresSaveAllZA) {
9666 Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Result,
9667 /*IsSave=*/false);
9668 }
9669
9670 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0 ||
9671 RequiresSaveAllZA) {
9672 for (unsigned I = 0; I < InVals.size(); ++I) {
9673 // The smstart/smstop is chained as part of the call, but when the
9674 // resulting chain is discarded (which happens when the call is not part
9675 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
9676 // smstart/smstop is chained to the result value. We can do that by doing
9677 // a vreg -> vreg copy.
9680 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
9681 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
9682 InVals[I].getValueType());
9683 }
9684 }
9685
9686 if (CallConv == CallingConv::PreserveNone) {
9687 for (const ISD::OutputArg &O : Outs) {
9688 if (O.Flags.isSwiftSelf() || O.Flags.isSwiftError() ||
9689 O.Flags.isSwiftAsync()) {
9692 MF.getFunction(),
9693 "Swift attributes can't be used with preserve_none",
9694 DL.getDebugLoc()));
9695 break;
9696 }
9697 }
9698 }
9699
9700 return Result;
9701}
9702
9703bool AArch64TargetLowering::CanLowerReturn(
9704 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
9705 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
9706 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9708 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
9709 return CCInfo.CheckReturn(Outs, RetCC);
9710}
9711
9712SDValue
9713AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
9714 bool isVarArg,
9716 const SmallVectorImpl<SDValue> &OutVals,
9717 const SDLoc &DL, SelectionDAG &DAG) const {
9718 auto &MF = DAG.getMachineFunction();
9719 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9720
9721 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9723 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
9724 CCInfo.AnalyzeReturn(Outs, RetCC);
9725
9726 // Copy the result values into the output registers.
9727 SDValue Glue;
9729 SmallSet<unsigned, 4> RegsUsed;
9730 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
9731 ++i, ++realRVLocIdx) {
9732 CCValAssign &VA = RVLocs[i];
9733 assert(VA.isRegLoc() && "Can only return in registers!");
9734 SDValue Arg = OutVals[realRVLocIdx];
9735
9736 switch (VA.getLocInfo()) {
9737 default:
9738 llvm_unreachable("Unknown loc info!");
9739 case CCValAssign::Full:
9740 if (Outs[i].ArgVT == MVT::i1) {
9741 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
9742 // value. This is strictly redundant on Darwin (which uses "zeroext
9743 // i1"), but will be optimised out before ISel.
9744 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
9745 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
9746 }
9747 break;
9748 case CCValAssign::BCvt:
9749 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
9750 break;
9751 case CCValAssign::AExt:
9752 case CCValAssign::ZExt:
9753 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
9754 break;
9756 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
9757 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
9758 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
9759 DAG.getConstant(32, DL, VA.getLocVT()));
9760 break;
9761 }
9762
9763 if (RegsUsed.count(VA.getLocReg())) {
9764 SDValue &Bits =
9765 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
9766 return Elt.first == VA.getLocReg();
9767 })->second;
9768 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
9769 } else {
9770 RetVals.emplace_back(VA.getLocReg(), Arg);
9771 RegsUsed.insert(VA.getLocReg());
9772 }
9773 }
9774
9775 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9776
9777 // Emit SMSTOP before returning from a locally streaming function
9778 SMEAttrs FuncAttrs(MF.getFunction());
9779 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
9780 if (FuncAttrs.hasStreamingCompatibleInterface()) {
9781 Register Reg = FuncInfo->getPStateSMReg();
9782 assert(Reg.isValid() && "PStateSM Register is invalid");
9783 SDValue PStateSM = DAG.getCopyFromReg(Chain, DL, Reg, MVT::i64);
9784 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
9785 /*Glue*/ SDValue(),
9787 } else
9788 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
9789 /*Glue*/ SDValue(), AArch64SME::Always);
9790 Glue = Chain.getValue(1);
9791 }
9792
9793 SmallVector<SDValue, 4> RetOps(1, Chain);
9794 for (auto &RetVal : RetVals) {
9795 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
9796 isPassedInFPR(RetVal.second.getValueType()))
9797 RetVal.second = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
9798 RetVal.second.getValueType(), RetVal.second);
9799 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
9800 Glue = Chain.getValue(1);
9801 RetOps.push_back(
9802 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
9803 }
9804
9805 // Windows AArch64 ABIs require that for returning structs by value we copy
9806 // the sret argument into X0 for the return.
9807 // We saved the argument into a virtual register in the entry block,
9808 // so now we copy the value out and into X0.
9809 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
9810 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
9812
9813 unsigned RetValReg = AArch64::X0;
9814 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
9815 RetValReg = AArch64::X8;
9816 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
9817 Glue = Chain.getValue(1);
9818
9819 RetOps.push_back(
9820 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
9821 }
9822
9823 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
9824 if (I) {
9825 for (; *I; ++I) {
9826 if (AArch64::GPR64RegClass.contains(*I))
9827 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
9828 else if (AArch64::FPR64RegClass.contains(*I))
9829 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
9830 else
9831 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
9832 }
9833 }
9834
9835 RetOps[0] = Chain; // Update chain.
9836
9837 // Add the glue if we have it.
9838 if (Glue.getNode())
9839 RetOps.push_back(Glue);
9840
9841 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
9842 // ARM64EC entry thunks use a special return sequence: instead of a regular
9843 // "ret" instruction, they need to explicitly call the emulator.
9844 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9845 SDValue Arm64ECRetDest =
9846 DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
9847 Arm64ECRetDest =
9848 getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
9849 Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
9851 RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
9852 RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
9853 return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
9854 }
9855
9856 return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
9857}
9858
9859//===----------------------------------------------------------------------===//
9860// Other Lowering Code
9861//===----------------------------------------------------------------------===//
9862
9863SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
9864 SelectionDAG &DAG,
9865 unsigned Flag) const {
9866 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
9867 N->getOffset(), Flag);
9868}
9869
9870SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
9871 SelectionDAG &DAG,
9872 unsigned Flag) const {
9873 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
9874}
9875
9876SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
9877 SelectionDAG &DAG,
9878 unsigned Flag) const {
9879 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
9880 N->getOffset(), Flag);
9881}
9882
9883SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
9884 SelectionDAG &DAG,
9885 unsigned Flag) const {
9886 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
9887}
9888
9889SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
9890 SelectionDAG &DAG,
9891 unsigned Flag) const {
9892 return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
9893}
9894
9895// (loadGOT sym)
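// Typically expands to a GOT access of the form (sketch):
//   adrp x0, :got:sym
//   ldr  x0, [x0, :got_lo12:sym]
// (or a LOADgotAUTH pseudo when the GOT is signed, per the check below).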
9896template <class NodeTy>
9897SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
9898 unsigned Flags) const {
9899 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
9900 SDLoc DL(N);
9901 EVT Ty = getPointerTy(DAG.getDataLayout());
9902 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
9903 // FIXME: Once remat is capable of dealing with instructions with register
9904 // operands, expand this into two nodes instead of using a wrapper node.
9905 if (DAG.getMachineFunction()
9907 ->hasELFSignedGOT())
9908 return SDValue(DAG.getMachineNode(AArch64::LOADgotAUTH, DL, Ty, GotAddr),
9909 0);
9910 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
9911}
9912
9913// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
9914template <class NodeTy>
9915SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
9916 unsigned Flags) const {
9917 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
9918 SDLoc DL(N);
9919 EVT Ty = getPointerTy(DAG.getDataLayout());
9920 const unsigned char MO_NC = AArch64II::MO_NC;
9921 return DAG.getNode(
9923 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
9924 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
9925 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
9926 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
9927}
9928
9929// (addlow (adrp %hi(sym)) %lo(sym))
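// Typically expands to (sketch):
//   adrp x0, sym
//   add  x0, x0, :lo12:sym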
9930template <class NodeTy>
9931SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
9932 unsigned Flags) const {
9933 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
9934 SDLoc DL(N);
9935 EVT Ty = getPointerTy(DAG.getDataLayout());
9936 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
9937 SDValue Lo = getTargetNode(N, Ty, DAG,
9940 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
9941}
9942
9943// (adr sym)
9944template <class NodeTy>
9945SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
9946 unsigned Flags) const {
9947 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
9948 SDLoc DL(N);
9949 EVT Ty = getPointerTy(DAG.getDataLayout());
9950 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
9951 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
9952}
9953
9954SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
9955 SelectionDAG &DAG) const {
9956 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
9957 const GlobalValue *GV = GN->getGlobal();
9958 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
9959
9960 if (OpFlags != AArch64II::MO_NO_FLAG)
9961 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
9962 "unexpected offset in global node");
9963
9964 // This also catches the large code model case for Darwin, and the tiny code
9965 // model with GOT relocations.
9966 if ((OpFlags & AArch64II::MO_GOT) != 0) {
9967 return getGOT(GN, DAG, OpFlags);
9968 }
9969
9973 Result = getAddrLarge(GN, DAG, OpFlags);
9974 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
9975 Result = getAddrTiny(GN, DAG, OpFlags);
9976 } else {
9977 Result = getAddr(GN, DAG, OpFlags);
9978 }
9979 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9980 SDLoc DL(GN);
9982 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
9984 return Result;
9985}
9986
9987/// Convert a TLS address reference into the correct sequence of loads
9988/// and calls to compute the variable's address (for Darwin, currently) and
9989/// return an SDValue containing the final node.
9990
9991/// Darwin only has one TLS scheme which must be capable of dealing with the
9992/// fully general situation, in the worst case. This means:
9993/// + "extern __thread" declaration.
9994/// + Defined in a possibly unknown dynamic library.
9995///
9996/// The general system is that each __thread variable has a [3 x i64] descriptor
9997/// which contains information used by the runtime to calculate the address. The
9998/// only part of this the compiler needs to know about is the first xword, which
9999/// contains a function pointer that must be called with the address of the
10000/// entire descriptor in "x0".
10001///
10002/// Since this descriptor may be in a different unit, in general even the
10003/// descriptor must be accessed via an indirect load. The "ideal" code sequence
10004/// is:
10005/// adrp x0, _var@TLVPPAGE
10006/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
10007/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
10008/// ; the function pointer
10009/// blr x1 ; Uses descriptor address in x0
10010/// ; Address of _var is now in x0.
10011///
10012/// If the address of _var's descriptor *is* known to the linker, then it can
10013/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
10014/// a slight efficiency gain.
10015SDValue
10016AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
10017 SelectionDAG &DAG) const {
10018 assert(Subtarget->isTargetDarwin() &&
10019 "This function expects a Darwin target");
10020
10021 SDLoc DL(Op);
10022 MVT PtrVT = getPointerTy(DAG.getDataLayout());
10023 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10024 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
10025
10026 SDValue TLVPAddr =
10027 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10028 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
10029
10030 // The first entry in the descriptor is a function pointer that we must call
10031 // to obtain the address of the variable.
10032 SDValue Chain = DAG.getEntryNode();
10033 SDValue FuncTLVGet = DAG.getLoad(
10034 PtrMemVT, DL, Chain, DescAddr,
10036 Align(PtrMemVT.getSizeInBits() / 8),
10038 Chain = FuncTLVGet.getValue(1);
10039
10040 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
10041 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
10042
10044 MFI.setAdjustsStack(true);
10045
10046 // TLS calls preserve all registers except those that absolutely must be
10047 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
10048 // silly).
10049 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10050 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
10051 if (Subtarget->hasCustomCallingConv())
10052 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
10053
10054 // Finally, we can make the call. This is just a degenerate version of a
10055 // normal AArch64 call node: x0 takes the address of the descriptor, and
10056 // returns the address of the variable in this thread.
10057 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
10058
10059 unsigned Opcode = AArch64ISD::CALL;
10061 Ops.push_back(Chain);
10062 Ops.push_back(FuncTLVGet);
10063
10064 // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0).
10065 if (DAG.getMachineFunction().getFunction().hasFnAttribute("ptrauth-calls")) {
10066 Opcode = AArch64ISD::AUTH_CALL;
10067 Ops.push_back(DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32));
10068 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); // Integer Disc.
10069 Ops.push_back(DAG.getRegister(AArch64::NoRegister, MVT::i64)); // Addr Disc.
10070 }
10071
10072 Ops.push_back(DAG.getRegister(AArch64::X0, MVT::i64));
10073 Ops.push_back(DAG.getRegisterMask(Mask));
10074 Ops.push_back(Chain.getValue(1));
10075 Chain = DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
10076 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
10077}
10078
10079/// Convert a thread-local variable reference into a sequence of instructions to
10080/// compute the variable's address for the local exec TLS model of ELF targets.
10081/// The sequence depends on the maximum TLS area size.
10082SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
10083 SDValue ThreadBase,
10084 const SDLoc &DL,
10085 SelectionDAG &DAG) const {
10086 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10087 SDValue TPOff, Addr;
10088
10089 switch (DAG.getTarget().Options.TLSSize) {
10090 default:
10091 llvm_unreachable("Unexpected TLS size");
10092
10093 case 12: {
10094 // mrs x0, TPIDR_EL0
10095 // add x0, x0, :tprel_lo12:a
10097 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
10098 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10099 Var,
10100 DAG.getTargetConstant(0, DL, MVT::i32)),
10101 0);
10102 }
10103
10104 case 24: {
10105 // mrs x0, TPIDR_EL0
10106 // add x0, x0, :tprel_hi12:a
10107 // add x0, x0, :tprel_lo12_nc:a
10108 SDValue HiVar = DAG.getTargetGlobalAddress(
10109 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10110 SDValue LoVar = DAG.getTargetGlobalAddress(
10111 GV, DL, PtrVT, 0,
10113 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10114 HiVar,
10115 DAG.getTargetConstant(0, DL, MVT::i32)),
10116 0);
10117 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
10118 LoVar,
10119 DAG.getTargetConstant(0, DL, MVT::i32)),
10120 0);
10121 }
10122
10123 case 32: {
10124 // mrs x1, TPIDR_EL0
10125 // movz x0, #:tprel_g1:a
10126 // movk x0, #:tprel_g0_nc:a
10127 // add x0, x1, x0
10128 SDValue HiVar = DAG.getTargetGlobalAddress(
10129 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
10130 SDValue LoVar = DAG.getTargetGlobalAddress(
10131 GV, DL, PtrVT, 0,
10133 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10134 DAG.getTargetConstant(16, DL, MVT::i32)),
10135 0);
10136 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10137 DAG.getTargetConstant(0, DL, MVT::i32)),
10138 0);
10139 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10140 }
10141
10142 case 48: {
10143 // mrs x1, TPIDR_EL0
10144 // movz x0, #:tprel_g2:a
10145 // movk x0, #:tprel_g1_nc:a
10146 // movk x0, #:tprel_g0_nc:a
10147 // add x0, x1, x0
10148 SDValue HiVar = DAG.getTargetGlobalAddress(
10149 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
10150 SDValue MiVar = DAG.getTargetGlobalAddress(
10151 GV, DL, PtrVT, 0,
10153 SDValue LoVar = DAG.getTargetGlobalAddress(
10154 GV, DL, PtrVT, 0,
10156 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10157 DAG.getTargetConstant(32, DL, MVT::i32)),
10158 0);
10159 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
10160 DAG.getTargetConstant(16, DL, MVT::i32)),
10161 0);
10162 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10163 DAG.getTargetConstant(0, DL, MVT::i32)),
10164 0);
10165 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10166 }
10167 }
10168}
10169
10170/// When accessing thread-local variables under either the general-dynamic or
10171/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
10172 /// have a descriptor, accessible via a PC-relative ADRP, whose first entry
10173/// is a function pointer to carry out the resolution.
10174///
10175/// The sequence is:
10176/// adrp x0, :tlsdesc:var
10177/// ldr x1, [x0, #:tlsdesc_lo12:var]
10178/// add x0, x0, #:tlsdesc_lo12:var
10179/// .tlsdesccall var
10180/// blr x1
10181/// (TPIDR_EL0 offset now in x0)
10182///
10183/// The above sequence must be produced unscheduled, to enable the linker to
10184/// optimize/relax this sequence.
10185/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
10186/// above sequence, and expanded really late in the compilation flow, to ensure
10187/// the sequence is produced as per above.
10188SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
10189 const SDLoc &DL,
10190 SelectionDAG &DAG) const {
10191 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10192
10193 SDValue Chain = DAG.getEntryNode();
10194 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
10195
10196 unsigned Opcode =
10197 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>()->hasELFSignedGOT()
10200 Chain = DAG.getNode(Opcode, DL, NodeTys, {Chain, SymAddr});
10201 SDValue Glue = Chain.getValue(1);
10202
10203 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
10204}
10205
10206SDValue
10207AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
10208 SelectionDAG &DAG) const {
10209 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
10210
10211 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10212 AArch64FunctionInfo *MFI =
10214
10218
10220 if (Model == TLSModel::LocalDynamic)
10222 }
10223
10225 Model != TLSModel::LocalExec)
10226 report_fatal_error("ELF TLS only supported in small memory model or "
10227 "in local exec TLS model");
10228 // Different choices can be made for the maximum size of the TLS area for a
10229 // module. For the small address model, the default TLS size is 16MiB and the
10230 // maximum TLS size is 4GiB.
10231 // FIXME: add tiny and large code model support for TLS access models other
10232 // than local exec. We currently generate the same code as small for tiny,
10233 // which may be larger than needed.
10234
10235 SDValue TPOff;
10236 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10237 SDLoc DL(Op);
10238 const GlobalValue *GV = GA->getGlobal();
10239
10240 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
10241
10242 if (Model == TLSModel::LocalExec) {
10243 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
10244 } else if (Model == TLSModel::InitialExec) {
10245 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10246 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
10247 } else if (Model == TLSModel::LocalDynamic) {
10248 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
10249 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
10250 // the beginning of the module's TLS region, followed by a DTPREL offset
10251 // calculation.
10252
10253 // These accesses will need deduplicating if there's more than one.
10255
10256 // The call needs a relocation too for linker relaxation. It doesn't make
10257 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
10258 // the address.
10259 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
10261
10262 // Now we can calculate the offset from TPIDR_EL0 to this module's
10263 // thread-local area.
10264 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
10265
10266 // Now use :dtprel_whatever: operations to calculate this variable's offset
10267 // in its thread-storage area.
10268 SDValue HiVar = DAG.getTargetGlobalAddress(
10269 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10270 SDValue LoVar = DAG.getTargetGlobalAddress(
10271 GV, DL, MVT::i64, 0,
10273
10274 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
10275 DAG.getTargetConstant(0, DL, MVT::i32)),
10276 0);
10277 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
10278 DAG.getTargetConstant(0, DL, MVT::i32)),
10279 0);
10280 } else if (Model == TLSModel::GeneralDynamic) {
10281 // The call needs a relocation too for linker relaxation. It doesn't make
10282 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
10283 // the address.
10284 SDValue SymAddr =
10285 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10286
10287 // Finally we can make a call to calculate the offset from tpidr_el0.
10288 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
10289 } else
10290 llvm_unreachable("Unsupported ELF TLS access model");
10291
10292 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10293}
10294
10295SDValue
10296AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
10297 SelectionDAG &DAG) const {
10298 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
10299
10300 SDValue Chain = DAG.getEntryNode();
10301 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10302 SDLoc DL(Op);
10303
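  // Overall shape of the lowering below (a sketch; register choices and the
  // exact relocation spellings are illustrative):
  //   ldr  x8, [x18, #0x58]          ; ThreadLocalStoragePointer from the TEB
  //   adrp x9, _tls_index
  //   ldr  w9, [x9, :lo12:_tls_index]
  //   ldr  x8, [x8, w9, uxtw #3]     ; this module's TLS block
  //   add  x0, x8, <hi12 of var's offset in .tls>
  //   add  x0, x0, <lo12 of var's offset in .tls>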
10304 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
10305
10306 // Load the ThreadLocalStoragePointer from the TEB
10307 // A pointer to the TLS array is located at offset 0x58 from the TEB.
10308 SDValue TLSArray =
10309 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
10310 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
10311 Chain = TLSArray.getValue(1);
10312
10313 // Load the TLS index from the C runtime.
10314 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
10315 // This also does the same as LOADgot, but using a generic i32 load,
10316 // while LOADgot only loads i64.
10317 SDValue TLSIndexHi =
10318 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
10319 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
10320 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10321 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
10322 SDValue TLSIndex =
10323 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
10324 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
10325 Chain = TLSIndex.getValue(1);
10326
10327 // The pointer to the thread's TLS data area is at the TLS Index scaled by 8
10328 // offset into the TLSArray.
10329 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
10330 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
10331 DAG.getConstant(3, DL, PtrVT));
10332 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
10333 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
10335 Chain = TLS.getValue(1);
10336
10337 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10338 const GlobalValue *GV = GA->getGlobal();
10339 SDValue TGAHi = DAG.getTargetGlobalAddress(
10340 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10341 SDValue TGALo = DAG.getTargetGlobalAddress(
10342 GV, DL, PtrVT, 0,
10344
10345 // Add the offset from the start of the .tls section (section base).
10346 SDValue Addr =
10347 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
10348 DAG.getTargetConstant(0, DL, MVT::i32)),
10349 0);
10350 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
10351 return Addr;
10352}
10353
10354SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
10355 SelectionDAG &DAG) const {
10356 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10357 if (DAG.getTarget().useEmulatedTLS())
10358 return LowerToTLSEmulatedModel(GA, DAG);
10359
10360 if (Subtarget->isTargetDarwin())
10361 return LowerDarwinGlobalTLSAddress(Op, DAG);
10362 if (Subtarget->isTargetELF())
10363 return LowerELFGlobalTLSAddress(Op, DAG);
10364 if (Subtarget->isTargetWindows())
10365 return LowerWindowsGlobalTLSAddress(Op, DAG);
10366
10367 llvm_unreachable("Unexpected platform trying to use TLS");
10368}
10369
10370//===----------------------------------------------------------------------===//
10371// PtrAuthGlobalAddress lowering
10372//
10373// We have 3 lowering alternatives to choose from:
10374// - MOVaddrPAC: similar to MOVaddr, with added PAC.
10375// If the GV doesn't need a GOT load (i.e., is locally defined)
10376// materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
10377//
10378// - LOADgotPAC: similar to LOADgot, with added PAC.
10379// If the GV needs a GOT load, materialize the pointer using the usual
10380// GOT adrp+ldr, +pac. Pointers in GOT are assumed to be not signed, the GOT
10381// section is assumed to be read-only (for example, via relro mechanism). See
10382// LowerMOVaddrPAC.
10383//
10384// - LOADauthptrstatic: similar to LOADgot, but use a
10385// special stub slot instead of a GOT slot.
10386// Load a signed pointer for symbol 'sym' from a stub slot named
10387// 'sym$auth_ptr$key$disc' filled by dynamic linker during relocation
10388// resolving. This usually lowers to adrp+ldr, but also emits an entry into
10389// .data with an @AUTH relocation. See LowerLOADauthptrstatic.
10390//
10391// All 3 are pseudos that are expand late to longer sequences: this lets us
10392// provide integrity guarantees on the to-be-signed intermediate values.
10393//
10394// LOADauthptrstatic is undesirable because it requires a large section filled
10395// with often similarly-signed pointers, making it a good harvesting target.
10396// Thus, it's only used for ptrauth references to extern_weak to avoid null
10397// checks.
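// For illustration (a sketch, not an exhaustive mapping): a constant such as
//   ptrauth (ptr @g, i32 0, i64 1234)
// lowers to MOVaddrPAC when @g is defined locally, to LOADgotPAC when @g needs
// a GOT load, and to LOADauthptrstatic only when @g is extern_weak.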
10398
10400 SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC,
10401 SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG) {
10402 const auto *TGN = cast<GlobalAddressSDNode>(TGA.getNode());
10403 assert(TGN->getGlobal()->hasExternalWeakLinkage());
10404
10405 // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
10406 // offset alone as a pointer if the symbol wasn't available, which would
10407 // probably break null checks in users. Ptrauth complicates things further:
10408 // error out.
10409 if (TGN->getOffset() != 0)
10411 "unsupported non-zero offset in weak ptrauth global reference");
10412
10413 if (!isNullConstant(AddrDiscriminator))
10414 report_fatal_error("unsupported weak addr-div ptrauth global");
10415
10416 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
10417 return SDValue(DAG.getMachineNode(AArch64::LOADauthptrstatic, DL, MVT::i64,
10418 {TGA, Key, Discriminator}),
10419 0);
10420}
10421
10422SDValue
10423AArch64TargetLowering::LowerPtrAuthGlobalAddress(SDValue Op,
10424 SelectionDAG &DAG) const {
10425 SDValue Ptr = Op.getOperand(0);
10426 uint64_t KeyC = Op.getConstantOperandVal(1);
10427 SDValue AddrDiscriminator = Op.getOperand(2);
10428 uint64_t DiscriminatorC = Op.getConstantOperandVal(3);
10429 EVT VT = Op.getValueType();
10430 SDLoc DL(Op);
10431
10432 if (KeyC > AArch64PACKey::LAST)
10433 report_fatal_error("key in ptrauth global out of range [0, " +
10434 Twine((int)AArch64PACKey::LAST) + "]");
10435
10436 // Blend only works if the integer discriminator is 16-bit wide.
10437 if (!isUInt<16>(DiscriminatorC))
10439 "constant discriminator in ptrauth global out of range [0, 0xffff]");
10440
10441 // Choosing between 3 lowering alternatives is target-specific.
10442 if (!Subtarget->isTargetELF() && !Subtarget->isTargetMachO())
10443 report_fatal_error("ptrauth global lowering only supported on MachO/ELF");
10444
10445 int64_t PtrOffsetC = 0;
10446 if (Ptr.getOpcode() == ISD::ADD) {
10447 PtrOffsetC = Ptr.getConstantOperandVal(1);
10448 Ptr = Ptr.getOperand(0);
10449 }
10450 const auto *PtrN = cast<GlobalAddressSDNode>(Ptr.getNode());
10451 const GlobalValue *PtrGV = PtrN->getGlobal();
10452
10453 // Classify the reference to determine whether it needs a GOT load.
10454 const unsigned OpFlags =
10455 Subtarget->ClassifyGlobalReference(PtrGV, getTargetMachine());
10456 const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
10457 assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
10458 "unsupported non-GOT op flags on ptrauth global reference");
10459
10460 // Fold any offset into the GV; our pseudos expect it there.
10461 PtrOffsetC += PtrN->getOffset();
10462 SDValue TPtr = DAG.getTargetGlobalAddress(PtrGV, DL, VT, PtrOffsetC,
10463 /*TargetFlags=*/0);
10464 assert(PtrN->getTargetFlags() == 0 &&
10465 "unsupported target flags on ptrauth global");
10466
10467 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
10468 SDValue Discriminator = DAG.getTargetConstant(DiscriminatorC, DL, MVT::i64);
10469 SDValue TAddrDiscriminator = !isNullConstant(AddrDiscriminator)
10470 ? AddrDiscriminator
10471 : DAG.getRegister(AArch64::XZR, MVT::i64);
10472
10473 // No GOT load needed -> MOVaddrPAC
10474 if (!NeedsGOTLoad) {
10475 assert(!PtrGV->hasExternalWeakLinkage() && "extern_weak should use GOT");
10476 return SDValue(
10477 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, MVT::i64,
10478 {TPtr, Key, TAddrDiscriminator, Discriminator}),
10479 0);
10480 }
10481
10482 // GOT load -> LOADgotPAC
10483 // Note that we disallow extern_weak refs to avoid null checks later.
10484 if (!PtrGV->hasExternalWeakLinkage())
10485 return SDValue(
10486 DAG.getMachineNode(AArch64::LOADgotPAC, DL, MVT::i64,
10487 {TPtr, Key, TAddrDiscriminator, Discriminator}),
10488 0);
10489
10490 // extern_weak ref -> LOADauthptrstatic
10492 TPtr, DL, VT, (AArch64PACKey::ID)KeyC, Discriminator, AddrDiscriminator,
10493 DAG);
10494}
10495
10496// Looks through \param Val to determine the bit that can be used to
10497// check the sign of the value. It returns the unextended value and
10498// the sign bit position.
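// For example, (sign_extend_inreg i64 %x, i8) yields {%x, 7}, while a plain
// i32 value yields {value, 31}.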
10499std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
10500 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
10501 return {Val.getOperand(0),
10502 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
10503 1};
10504
10505 if (Val.getOpcode() == ISD::SIGN_EXTEND)
10506 return {Val.getOperand(0),
10507 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
10508
10509 return {Val, Val.getValueSizeInBits() - 1};
10510}
10511
10512SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
10513 SDValue Chain = Op.getOperand(0);
10514 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
10515 SDValue LHS = Op.getOperand(2);
10516 SDValue RHS = Op.getOperand(3);
10517 SDValue Dest = Op.getOperand(4);
10518 SDLoc dl(Op);
10519
10521 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
10522 // will not be produced, as they are conditional branch instructions that do
10523 // not set flags.
10524 bool ProduceNonFlagSettingCondBr =
10525 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
10526
10527 // Handle f128 first, since lowering it will result in comparing the return
10528 // value of a libcall against zero, which is just what the rest of LowerBR_CC
10529 // is expecting to deal with.
10530 if (LHS.getValueType() == MVT::f128) {
10531 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
10532
10533 // If softenSetCCOperands returned a scalar, we need to compare the result
10534 // against zero to select between true and false values.
10535 if (!RHS.getNode()) {
10536 RHS = DAG.getConstant(0, dl, LHS.getValueType());
10537 CC = ISD::SETNE;
10538 }
10539 }
10540
10541 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
10542 // instruction.
10543 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
10544 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
10545 // Only lower legal XALUO ops.
10546 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
10547 return SDValue();
10548
10549 // The actual operation with overflow check.
10551 SDValue Value, Overflow;
10552 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
10553
10554 if (CC == ISD::SETNE)
10555 OFCC = getInvertedCondCode(OFCC);
10556 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
10557
10558 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
10559 Overflow);
10560 }
10561
10562 if (LHS.getValueType().isInteger()) {
10563 assert((LHS.getValueType() == RHS.getValueType()) &&
10564 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
10565
10566 // If the RHS of the comparison is zero, we can potentially fold this
10567 // to a specialized branch.
10568 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
10569 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
10570 if (CC == ISD::SETEQ) {
10571 // See if we can use a TBZ to fold in an AND as well.
10572 // TBZ has a smaller branch displacement than CBZ. If the offset is
10573 // out of bounds, a late MI-layer pass rewrites branches.
10574 // 403.gcc is an example that hits this case.
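        // For example, (brcond (seteq (and x, 8), 0), dest) becomes
        // (TBZ x, #3, dest).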
10575 if (LHS.getOpcode() == ISD::AND &&
10576 isa<ConstantSDNode>(LHS.getOperand(1)) &&
10577 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
10578 SDValue Test = LHS.getOperand(0);
10579 uint64_t Mask = LHS.getConstantOperandVal(1);
10580 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
10581 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
10582 Dest);
10583 }
10584
10585 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
10586 } else if (CC == ISD::SETNE) {
10587 // See if we can use a TBZ to fold in an AND as well.
10588 // TBZ has a smaller branch displacement than CBZ. If the offset is
10589 // out of bounds, a late MI-layer pass rewrites branches.
10590 // 403.gcc is an example that hits this case.
10591 if (LHS.getOpcode() == ISD::AND &&
10592 isa<ConstantSDNode>(LHS.getOperand(1)) &&
10593 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
10594 SDValue Test = LHS.getOperand(0);
10595 uint64_t Mask = LHS.getConstantOperandVal(1);
10596 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
10597 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
10598 Dest);
10599 }
10600
10601 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
10602 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
10603 // Don't combine AND since emitComparison converts the AND to an ANDS
10604 // (a.k.a. TST) and the test in the test bit and branch instruction
10605 // becomes redundant. This would also increase register pressure.
10606 uint64_t SignBitPos;
10607 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
10608 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
10609 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
10610 }
10611 }
10612 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
10613 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
10614 // Don't combine AND since emitComparison converts the AND to an ANDS
10615 // (a.k.a. TST) and the test performed by the test-bit-and-branch instruction
10616 // becomes redundant. This would also increase register pressure.
10617 uint64_t SignBitPos;
10618 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
10619 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
10620 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
10621 }
10622
10623 SDValue CCVal;
10624 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
10625 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
10626 Cmp);
10627 }
10628
10629 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
10630 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
10631
10632 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
10633 // clean. Some of them require two branches to implement.
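// For instance, an unordered-or-equal compare (SETUEQ) has no single AArch64
// condition code; it typically expands to a branch on EQ plus a second branch
// on VS (unordered), which is what the CC2 path below handles.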
10634 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
10635 AArch64CC::CondCode CC1, CC2;
10636 changeFPCCToAArch64CC(CC, CC1, CC2);
10637 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
10638 SDValue BR1 =
10639 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
10640 if (CC2 != AArch64CC::AL) {
10641 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
10642 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
10643 Cmp);
10644 }
10645
10646 return BR1;
10647}
10648
10649SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
10650 SelectionDAG &DAG) const {
10651 if (!Subtarget->isNeonAvailable() &&
10652 !Subtarget->useSVEForFixedLengthVectors())
10653 return SDValue();
10654
10655 EVT VT = Op.getValueType();
10656 EVT IntVT = VT.changeTypeToInteger();
10657 SDLoc DL(Op);
10658
10659 SDValue In1 = Op.getOperand(0);
10660 SDValue In2 = Op.getOperand(1);
10661 EVT SrcVT = In2.getValueType();
10662
10663 if (!SrcVT.bitsEq(VT))
10664 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
10665
10666 if (VT.isScalableVector())
10667 IntVT =
10668 getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
10669
10670 if (VT.isFixedLengthVector() &&
10671 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
10672 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
10673
10674 In1 = convertToScalableVector(DAG, ContainerVT, In1);
10675 In2 = convertToScalableVector(DAG, ContainerVT, In2);
10676
10677 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
10678 return convertFromScalableVector(DAG, VT, Res);
10679 }
10680
10681 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
10682 if (VT.isScalableVector())
10683 return getSVESafeBitCast(VT, Op, DAG);
10684
10685 return DAG.getBitcast(VT, Op);
10686 };
10687
10688 SDValue VecVal1, VecVal2;
10689 EVT VecVT;
10690 auto SetVecVal = [&](int Idx = -1) {
10691 if (!VT.isVector()) {
10692 VecVal1 =
10693 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
10694 VecVal2 =
10695 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
10696 } else {
10697 VecVal1 = BitCast(VecVT, In1, DAG);
10698 VecVal2 = BitCast(VecVT, In2, DAG);
10699 }
10700 };
10701 if (VT.isVector()) {
10702 VecVT = IntVT;
10703 SetVecVal();
10704 } else if (VT == MVT::f64) {
10705 VecVT = MVT::v2i64;
10706 SetVecVal(AArch64::dsub);
10707 } else if (VT == MVT::f32) {
10708 VecVT = MVT::v4i32;
10709 SetVecVal(AArch64::ssub);
10710 } else if (VT == MVT::f16 || VT == MVT::bf16) {
10711 VecVT = MVT::v8i16;
10712 SetVecVal(AArch64::hsub);
10713 } else {
10714 llvm_unreachable("Invalid type for copysign!");
10715 }
10716
10717 unsigned BitWidth = In1.getScalarValueSizeInBits();
10718 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
10719
10720 // We want to materialize a mask with every bit but the high bit set, but the
10721 // AdvSIMD immediate moves cannot materialize that in a single instruction for
10722 // 64-bit elements. Instead, materialize all bits set and then negate that.
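// For example, for 64-bit elements: all-ones (0xFFFFFFFFFFFFFFFF) bitcast to
// f64 is a NaN with the sign bit set; FNEG flips only the sign bit, yielding
// 0x7FFFFFFFFFFFFFFF, i.e. every bit set except the sign bit.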
10723 if (VT == MVT::f64 || VT == MVT::v2f64) {
10724 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
10725 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
10726 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
10727 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
10728 }
10729
10730 SDValue BSP =
10731 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
10732 if (VT == MVT::f16 || VT == MVT::bf16)
10733 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
10734 if (VT == MVT::f32)
10735 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
10736 if (VT == MVT::f64)
10737 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
10738
10739 return BitCast(VT, BSP, DAG);
10740}
10741
10742SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
10743 SelectionDAG &DAG) const {
10744 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
10745 Attribute::NoImplicitFloat))
10746 return SDValue();
10747
10748 EVT VT = Op.getValueType();
10749 if (VT.isScalableVector() ||
10751 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
10752
10753 if (!Subtarget->isNeonAvailable())
10754 return SDValue();
10755
10756 bool IsParity = Op.getOpcode() == ISD::PARITY;
10757 SDValue Val = Op.getOperand(0);
10758 SDLoc DL(Op);
10759
10760 // For i32, a general parity computation using EORs is more efficient than
10761 // going through the floating-point/SIMD registers.
10762 if (VT == MVT::i32 && IsParity)
10763 return SDValue();
10764
10765 // If there is no CNT instruction available, GPR popcount can
10766 // be more efficiently lowered to the following sequence that uses
10767 // AdvSIMD registers/instructions as long as the copies to/from
10768 // the AdvSIMD registers are cheap.
10769 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
10770 // CNT V0.8B, V0.8B // 8xbyte pop-counts
10771 // ADDV B0, V0.8B // sum 8xbyte pop-counts
10772 // FMOV X0, D0 // copy result back to integer reg
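// Worked example (illustrative): for X0 = 0x0F0000000000000F, CNT produces a
// per-byte count of 4 in the lowest and highest bytes (0 elsewhere), and ADDV
// sums the eight lanes to 8, the popcount of the original value.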
10773 if (VT == MVT::i32 || VT == MVT::i64) {
10774 if (VT == MVT::i32)
10775 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
10776 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
10777
10778 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
10779 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v8i8, CtPop);
10780 if (VT == MVT::i32)
10781 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, AddV,
10782 DAG.getConstant(0, DL, MVT::i64));
10783 AddV = DAG.getNode(ISD::BITCAST, DL, VT, AddV);
10784 if (IsParity)
10785 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
10786 return AddV;
10787 } else if (VT == MVT::i128) {
10788 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
10789
10790 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
10791 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v16i8, CtPop);
10792 AddV = DAG.getNode(ISD::BITCAST, DL, VT, AddV);
10793 if (IsParity)
10794 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
10795 return AddV;
10796 }
10797
10798 assert(!IsParity && "ISD::PARITY of vector types not supported");
10799
10800 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
10801 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
10802 "Unexpected type for custom ctpop lowering");
10803
10804 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
10805 Val = DAG.getBitcast(VT8Bit, Val);
10806 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
10807
10808 if (Subtarget->hasDotProd() && VT.getScalarSizeInBits() != 16 &&
10809 VT.getVectorNumElements() >= 2) {
10810 EVT DT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
10811 SDValue Zeros = DAG.getConstant(0, DL, DT);
10812 SDValue Ones = DAG.getConstant(1, DL, VT8Bit);
10813
10814 if (VT == MVT::v2i64) {
10815 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
10816 Val = DAG.getNode(AArch64ISD::UADDLP, DL, VT, Val);
10817 } else if (VT == MVT::v2i32) {
10818 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
10819 } else if (VT == MVT::v4i32) {
10820 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
10821 } else {
10822 llvm_unreachable("Unexpected type for custom ctpop lowering");
10823 }
10824
10825 return Val;
10826 }
10827
10828 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
10829 unsigned EltSize = 8;
10830 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
10831 while (EltSize != VT.getScalarSizeInBits()) {
10832 EltSize *= 2;
10833 NumElts /= 2;
10834 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
10835 Val = DAG.getNode(AArch64ISD::UADDLP, DL, WidenVT, Val);
10836 }
10837
10838 return Val;
10839}
10840
10841SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
10842 EVT VT = Op.getValueType();
10843 assert(VT.isScalableVector() ||
10844 useSVEForFixedLengthVectorVT(
10845 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
10846
10847 SDLoc DL(Op);
10848 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
10849 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
10850}
10851
10852SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
10853 SelectionDAG &DAG) const {
10854
10855 EVT VT = Op.getValueType();
10856 SDLoc DL(Op);
10857 unsigned Opcode = Op.getOpcode();
10858 ISD::CondCode CC;
10859 switch (Opcode) {
10860 default:
10861 llvm_unreachable("Wrong instruction");
10862 case ISD::SMAX:
10863 CC = ISD::SETGT;
10864 break;
10865 case ISD::SMIN:
10866 CC = ISD::SETLT;
10867 break;
10868 case ISD::UMAX:
10869 CC = ISD::SETUGT;
10870 break;
10871 case ISD::UMIN:
10872 CC = ISD::SETULT;
10873 break;
10874 }
10875
10876 if (VT.isScalableVector() ||
10877 useSVEForFixedLengthVectorVT(
10878 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
10879 switch (Opcode) {
10880 default:
10881 llvm_unreachable("Wrong instruction");
10882 case ISD::SMAX:
10883 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
10884 case ISD::SMIN:
10885 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
10886 case ISD::UMAX:
10887 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
10888 case ISD::UMIN:
10889 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
10890 }
10891 }
10892
10893 SDValue Op0 = Op.getOperand(0);
10894 SDValue Op1 = Op.getOperand(1);
10895 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
10896 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
10897}
10898
10899SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
10900 SelectionDAG &DAG) const {
10901 EVT VT = Op.getValueType();
10902
10903 if (VT.isScalableVector() ||
10904 useSVEForFixedLengthVectorVT(
10905 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
10906 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
10907
10908 SDLoc DL(Op);
10909 SDValue REVB;
10910 MVT VST;
10911
10912 switch (VT.getSimpleVT().SimpleTy) {
10913 default:
10914 llvm_unreachable("Invalid type for bitreverse!");
10915
10916 case MVT::v2i32: {
10917 VST = MVT::v8i8;
10918 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
10919
10920 break;
10921 }
10922
10923 case MVT::v4i32: {
10924 VST = MVT::v16i8;
10925 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
10926
10927 break;
10928 }
10929
10930 case MVT::v1i64: {
10931 VST = MVT::v8i8;
10932 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
10933
10934 break;
10935 }
10936
10937 case MVT::v2i64: {
10938 VST = MVT::v16i8;
10939 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
10940
10941 break;
10942 }
10943 }
10944
10945 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
10946 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
10947}
10948
10949 // Check whether N forms a continuous comparison sequence (ORs of XORs).
10950static bool
10951isOrXorChain(SDValue N, unsigned &Num,
10952 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
10953 if (Num == MaxXors)
10954 return false;
10955
10956 // Skip the one-use zext
10957 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
10958 N = N->getOperand(0);
10959
10960 // The leaf node must be XOR
10961 if (N->getOpcode() == ISD::XOR) {
10962 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
10963 Num++;
10964 return true;
10965 }
10966
10967 // All the non-leaf nodes must be OR.
10968 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
10969 return false;
10970
10971 if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
10972 isOrXorChain(N->getOperand(1), Num, WorkList))
10973 return true;
10974 return false;
10975}
10976
10977 // Transform chains of ORs and XORs, which are usually outlined by memcmp/bcmp.
10979 SDValue LHS = N->getOperand(0);
10980 SDValue RHS = N->getOperand(1);
10981 SDLoc DL(N);
10982 EVT VT = N->getValueType(0);
10983 SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
10984
10985 // Only handle integer compares.
10986 if (N->getOpcode() != ISD::SETCC)
10987 return SDValue();
10988
10989 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
10990 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
10991 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
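// Illustrative example (register names are hypothetical): for
//   (a0 == a1) && (b0 == b1), i.e. ((a0 ^ a1) | (b0 ^ b1)) == 0,
// this produces roughly "cmp a0, a1; ccmp b0, b1, #0, eq; cset w0, eq",
// replacing the XOR/OR tree with a flag-setting compare chain.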
10992 unsigned NumXors = 0;
10993 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
10994 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
10995 isOrXorChain(LHS, NumXors, WorkList)) {
10996 SDValue XOR0, XOR1;
10997 std::tie(XOR0, XOR1) = WorkList[0];
10998 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
10999 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11000 for (unsigned I = 1; I < WorkList.size(); I++) {
11001 std::tie(XOR0, XOR1) = WorkList[I];
11002 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11003 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
11004 }
11005
11006 // Exit early by inverting the condition, which helps reduce indentation.
11007 return Cmp;
11008 }
11009
11010 return SDValue();
11011}
11012
11013SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
11014
11015 if (Op.getValueType().isVector())
11016 return LowerVSETCC(Op, DAG);
11017
11018 bool IsStrict = Op->isStrictFPOpcode();
11019 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
11020 unsigned OpNo = IsStrict ? 1 : 0;
11021 SDValue Chain;
11022 if (IsStrict)
11023 Chain = Op.getOperand(0);
11024 SDValue LHS = Op.getOperand(OpNo + 0);
11025 SDValue RHS = Op.getOperand(OpNo + 1);
11026 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
11027 SDLoc dl(Op);
11028
11029 // We chose ZeroOrOneBooleanContents, so use zero and one.
11030 EVT VT = Op.getValueType();
11031 SDValue TVal = DAG.getConstant(1, dl, VT);
11032 SDValue FVal = DAG.getConstant(0, dl, VT);
11033
11034 // Handle f128 first, since one possible outcome is a normal integer
11035 // comparison which gets picked up by the next if statement.
11036 if (LHS.getValueType() == MVT::f128) {
11037 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
11038 IsSignaling);
11039
11040 // If softenSetCCOperands returned a scalar, use it.
11041 if (!RHS.getNode()) {
11042 assert(LHS.getValueType() == Op.getValueType() &&
11043 "Unexpected setcc expansion!");
11044 return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
11045 }
11046 }
11047
11048 if (LHS.getValueType().isInteger()) {
11049
11050 simplifySetCCIntoEq(CC, LHS, RHS, DAG, dl);
11051
11052 SDValue CCVal;
11053 SDValue Cmp = getAArch64Cmp(
11054 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
11055
11056 // Note that we inverted the condition above, so we reverse the order of
11057 // the true and false operands here. This will allow the setcc to be
11058 // matched to a single CSINC instruction.
11059 SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
11060 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
11061 }
11062
11063 // Now we know we're dealing with FP values.
11064 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
11065 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11066
11067 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
11068 // and do the comparison.
11069 SDValue Cmp;
11070 if (IsStrict)
11071 Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
11072 else
11073 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
11074
11075 AArch64CC::CondCode CC1, CC2;
11076 changeFPCCToAArch64CC(CC, CC1, CC2);
11077 SDValue Res;
11078 if (CC2 == AArch64CC::AL) {
11079 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
11080 CC2);
11081 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
11082
11083 // Note that we inverted the condition above, so we reverse the order of
11084 // the true and false operands here. This will allow the setcc to be
11085 // matched to a single CSINC instruction.
11086 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
11087 } else {
11088 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
11089 // totally clean. Some of them require two CSELs to implement. As is in
11090 // this case, we emit the first CSEL and then emit a second using the output
11091 // of the first as the RHS. We're effectively OR'ing the two CC's together.
11092
11093 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
11094 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
11095 SDValue CS1 =
11096 DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
11097
11098 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
11099 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
11100 }
11101 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
11102}
11103
11104SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
11105 SelectionDAG &DAG) const {
11106
11107 SDValue LHS = Op.getOperand(0);
11108 SDValue RHS = Op.getOperand(1);
11109 EVT VT = LHS.getValueType();
11110 if (VT != MVT::i32 && VT != MVT::i64)
11111 return SDValue();
11112
11113 SDLoc DL(Op);
11114 SDValue Carry = Op.getOperand(2);
11115 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
11116 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
11117 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue),
11118 LHS, RHS, InvCarry);
11119
11120 EVT OpVT = Op.getValueType();
11121 SDValue TVal = DAG.getConstant(1, DL, OpVT);
11122 SDValue FVal = DAG.getConstant(0, DL, OpVT);
11123
11124 ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
11125 ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
11126 SDValue CCVal =
11127 DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32);
11128 // Inputs are swapped because the condition is inverted. This will allow
11129 // matching with a single CSINC instruction.
11130 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
11131 Cmp.getValue(1));
11132}
11133
11134SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
11135 SDValue RHS, SDValue TVal,
11136 SDValue FVal, const SDLoc &dl,
11137 SelectionDAG &DAG) const {
11138 // Handle f128 first, because it will result in a comparison of some RTLIB
11139 // call result against zero.
11140 if (LHS.getValueType() == MVT::f128) {
11141 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
11142
11143 // If softenSetCCOperands returned a scalar, we need to compare the result
11144 // against zero to select between true and false values.
11145 if (!RHS.getNode()) {
11146 RHS = DAG.getConstant(0, dl, LHS.getValueType());
11147 CC = ISD::SETNE;
11148 }
11149 }
11150
11151 // Also handle f16, for which we need to do a f32 comparison.
11152 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
11153 LHS.getValueType() == MVT::bf16) {
11154 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
11155 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
11156 }
11157
11158 // Next, handle integers.
11159 if (LHS.getValueType().isInteger()) {
11160 assert((LHS.getValueType() == RHS.getValueType()) &&
11161 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
11162
11163 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
11164 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
11165 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
11166 // Check for the sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and
11167 // transform it into (OR (ASR lhs, N-1), 1), which requires fewer
11168 // instructions for the supported types.
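// For example (i32, illustrative): "x > -1 ? 1 : -1" becomes "(x >> 31) | 1"
// with an arithmetic shift: the shift gives 0 for non-negative x and -1 for
// negative x, and OR-ing in 1 yields 1 or -1.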
11169 if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
11170 CTVal->isOne() && CFVal->isAllOnes() &&
11171 LHS.getValueType() == TVal.getValueType()) {
11172 EVT VT = LHS.getValueType();
11173 SDValue Shift =
11174 DAG.getNode(ISD::SRA, dl, VT, LHS,
11175 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
11176 return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
11177 }
11178
11179 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
11180 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
11181 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
11182 // Both require fewer instructions than a compare and conditional select.
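// For example (i32, illustrative): smax(x, 0) becomes x & ~(x >> 31) (a BIC)
// and smin(x, 0) becomes x & (x >> 31) (an AND), since the arithmetic shift
// yields 0 for non-negative x and all-ones for negative x.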
11183 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
11184 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
11185 LHS.getValueType() == RHS.getValueType()) {
11186 EVT VT = LHS.getValueType();
11187 SDValue Shift =
11188 DAG.getNode(ISD::SRA, dl, VT, LHS,
11189 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
11190
11191 if (CC == ISD::SETGT)
11192 Shift = DAG.getNOT(dl, Shift, VT);
11193
11194 return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
11195 }
11196
11197 unsigned Opcode = AArch64ISD::CSEL;
11198
11199 // If both the TVal and the FVal are constants, see if we can swap them in
11200 // order to form a CSINV or CSINC out of them.
11201 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
11202 std::swap(TVal, FVal);
11203 std::swap(CTVal, CFVal);
11204 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11205 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
11206 std::swap(TVal, FVal);
11207 std::swap(CTVal, CFVal);
11208 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11209 } else if (TVal.getOpcode() == ISD::XOR) {
11210 // If TVal is a NOT we want to swap TVal and FVal so that we can match
11211 // with a CSINV rather than a CSEL.
11212 if (isAllOnesConstant(TVal.getOperand(1))) {
11213 std::swap(TVal, FVal);
11214 std::swap(CTVal, CFVal);
11215 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11216 }
11217 } else if (TVal.getOpcode() == ISD::SUB) {
11218 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
11219 // that we can match with a CSNEG rather than a CSEL.
11220 if (isNullConstant(TVal.getOperand(0))) {
11221 std::swap(TVal, FVal);
11222 std::swap(CTVal, CFVal);
11223 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11224 }
11225 } else if (CTVal && CFVal) {
11226 const int64_t TrueVal = CTVal->getSExtValue();
11227 const int64_t FalseVal = CFVal->getSExtValue();
11228 bool Swap = false;
11229
11230 // If both TVal and FVal are constants, see if FVal is the
11231 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
11232 // instead of a CSEL in that case.
11233 if (TrueVal == ~FalseVal) {
11234 Opcode = AArch64ISD::CSINV;
11235 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
11236 TrueVal == -FalseVal) {
11237 Opcode = AArch64ISD::CSNEG;
11238 } else if (TVal.getValueType() == MVT::i32) {
11239 // If our operands are only 32-bit wide, make sure we use 32-bit
11240 // arithmetic when checking whether we can use CSINC. This ensures that
11241 // the addition in the check will wrap around properly in case there is
11242 // an overflow (which would not be the case if we do the check with
11243 // 64-bit arithmetic).
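// Worked example (illustrative): TVal = 0 and FVal = -1 as i32 give
// FalseVal32 = 0xFFFFFFFF, and FalseVal32 + 1 wraps around to 0 == TrueVal32,
// so a CSINC is possible; with 64-bit arithmetic the sum would be 0x100000000
// and the opportunity would be missed.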
11244 const uint32_t TrueVal32 = CTVal->getZExtValue();
11245 const uint32_t FalseVal32 = CFVal->getZExtValue();
11246
11247 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
11248 Opcode = AArch64ISD::CSINC;
11249
11250 if (TrueVal32 > FalseVal32) {
11251 Swap = true;
11252 }
11253 }
11254 } else {
11255 // 64-bit check whether we can use CSINC.
11256 const uint64_t TrueVal64 = TrueVal;
11257 const uint64_t FalseVal64 = FalseVal;
11258
11259 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
11260 Opcode = AArch64ISD::CSINC;
11261
11262 if (TrueVal > FalseVal) {
11263 Swap = true;
11264 }
11265 }
11266 }
11267
11268 // Swap TVal and FVal if necessary.
11269 if (Swap) {
11270 std::swap(TVal, FVal);
11271 std::swap(CTVal, CFVal);
11272 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11273 }
11274
11275 if (Opcode != AArch64ISD::CSEL) {
11276 // Drop FVal since we can get its value by simply inverting/negating
11277 // TVal.
11278 FVal = TVal;
11279 }
11280 }
11281
11282 // Avoid materializing a constant when possible by reusing a known value in
11283 // a register. However, don't perform this optimization if the known value
11284 // is one, zero or negative one in the case of a CSEL. We can always
11285 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
11286 // FVal, respectively.
11287 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
11288 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
11289 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
11291 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
11292 // "a != C ? x : a" to avoid materializing C.
11293 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
11294 TVal = LHS;
11295 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
11296 FVal = LHS;
11297 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
11298 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
11299 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
11300 // avoid materializing C.
11301 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
11302 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
11303 Opcode = AArch64ISD::CSINV;
11304 TVal = LHS;
11305 FVal = DAG.getConstant(0, dl, FVal.getValueType());
11306 }
11307 }
11308
11309 SDValue CCVal;
11310 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
11311 EVT VT = TVal.getValueType();
11312 return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
11313 }
11314
11315 // Now we know we're dealing with FP values.
11316 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
11317 LHS.getValueType() == MVT::f64);
11318 assert(LHS.getValueType() == RHS.getValueType());
11319 EVT VT = TVal.getValueType();
11320 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
11321
11322 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
11323 // clean. Some of them require two CSELs to implement.
11324 AArch64CC::CondCode CC1, CC2;
11325 changeFPCCToAArch64CC(CC, CC1, CC2);
11326
11327 if (DAG.getTarget().Options.UnsafeFPMath) {
11328 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
11329 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
11330 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
11331 if (RHSVal && RHSVal->isZero()) {
11332 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
11333 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
11334
11335 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
11336 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
11337 TVal = LHS;
11338 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
11339 CFVal && CFVal->isZero() &&
11340 FVal.getValueType() == LHS.getValueType())
11341 FVal = LHS;
11342 }
11343 }
11344
11345 // Emit first, and possibly only, CSEL.
11346 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
11347 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
11348
11349 // If we need a second CSEL, emit it, using the output of the first as the
11350 // RHS. We're effectively OR'ing the two CC's together.
11351 if (CC2 != AArch64CC::AL) {
11352 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
11353 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
11354 }
11355
11356 // Otherwise, return the output of the first CSEL.
11357 return CS1;
11358}
11359
11360SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
11361 SelectionDAG &DAG) const {
11362 EVT Ty = Op.getValueType();
11363 auto Idx = Op.getConstantOperandAPInt(2);
11364 int64_t IdxVal = Idx.getSExtValue();
11365 assert(Ty.isScalableVector() &&
11366 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
11367
11368 // We can use the splice instruction for certain index values where we are
11369 // able to efficiently generate the correct predicate. The index will be
11370 // inverted and used directly as the input to the ptrue instruction, i.e.
11371 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
11372 // splice predicate. However, we can only do this if we can guarantee that
11373 // there are enough elements in the vector, hence we check that the absolute
11374 // index is <= the minimum number of elements.
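// For example (illustrative): a splice index of -2 maps to the vl2 pattern;
// reversing that ptrue activates only the last two lanes, so the SPLICE takes
// the final two elements of the first operand followed by elements of the
// second.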
11375 std::optional<unsigned> PredPattern;
11376 if (Ty.isScalableVector() && IdxVal < 0 &&
11377 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
11378 std::nullopt) {
11379 SDLoc DL(Op);
11380
11381 // Create a predicate where all but the last -IdxVal elements are false.
11382 EVT PredVT = Ty.changeVectorElementType(MVT::i1);
11383 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
11384 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
11385
11386 // Now splice the two inputs together using the predicate.
11387 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
11388 Op.getOperand(1));
11389 }
11390
11391 // We can select to an EXT instruction when indexing the first 256 bytes.
11393 if (IdxVal >= 0 && (IdxVal * BlockSize / 8) < 256)
11394 return Op;
11395
11396 return SDValue();
11397}
11398
11399SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
11400 SelectionDAG &DAG) const {
11401 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
11402 SDValue LHS = Op.getOperand(0);
11403 SDValue RHS = Op.getOperand(1);
11404 SDValue TVal = Op.getOperand(2);
11405 SDValue FVal = Op.getOperand(3);
11406 SDLoc DL(Op);
11407 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
11408}
11409
11410SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
11411 SelectionDAG &DAG) const {
11412 SDValue CCVal = Op->getOperand(0);
11413 SDValue TVal = Op->getOperand(1);
11414 SDValue FVal = Op->getOperand(2);
11415 SDLoc DL(Op);
11416
11417 EVT Ty = Op.getValueType();
11418 if (Ty == MVT::aarch64svcount) {
11419 TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
11420 FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
11421 SDValue Sel =
11422 DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
11423 return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
11424 }
11425
11426 if (Ty.isScalableVector()) {
11427 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
11428 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
11429 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
11430 }
11431
11432 if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
11433 // FIXME: Ideally this would be the same as above using i1 types, however
11434 // for the moment we can't deal with fixed i1 vector types properly, so
11435 // instead extend the predicate to a result type sized integer vector.
11436 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
11437 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
11438 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
11439 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
11440 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
11441 }
11442
11443 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
11444 // instruction.
11445 if (ISD::isOverflowIntrOpRes(CCVal)) {
11446 // Only lower legal XALUO ops.
11447 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
11448 return SDValue();
11449
11450 AArch64CC::CondCode OFCC;
11451 SDValue Value, Overflow;
11452 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
11453 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
11454
11455 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
11456 CCVal, Overflow);
11457 }
11458
11459 // Lower it the same way as we would lower a SELECT_CC node.
11460 ISD::CondCode CC;
11461 SDValue LHS, RHS;
11462 if (CCVal.getOpcode() == ISD::SETCC) {
11463 LHS = CCVal.getOperand(0);
11464 RHS = CCVal.getOperand(1);
11465 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
11466 } else {
11467 LHS = CCVal;
11468 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
11469 CC = ISD::SETNE;
11470 }
11471
11472 // If we are lowering a f16 and we do not have fullf16, convert to a f32 in
11473 // order to use FCSELSrrr
11474 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
11475 TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
11476 DAG.getUNDEF(MVT::f32), TVal);
11477 FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
11478 DAG.getUNDEF(MVT::f32), FVal);
11479 }
11480
11481 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
11482
11483 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
11484 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
11485 }
11486
11487 return Res;
11488}
11489
11490SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
11491 SelectionDAG &DAG) const {
11492 // Jump table entries are emitted as PC-relative offsets. No additional
11493 // tweaking is necessary here. Just get the address of the jump table.
11494 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
11495
11498 !Subtarget->isTargetMachO())
11499 return getAddrLarge(JT, DAG);
11500 if (CM == CodeModel::Tiny)
11501 return getAddrTiny(JT, DAG);
11502 return getAddr(JT, DAG);
11503}
11504
11505SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
11506 SelectionDAG &DAG) const {
11507 // Jump table entries are emitted as PC-relative offsets. No additional
11508 // tweaking is necessary here. Just get the address of the jump table.
11509 SDLoc DL(Op);
11510 SDValue JT = Op.getOperand(1);
11511 SDValue Entry = Op.getOperand(2);
11512 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
11513
11514 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
11515 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
11516
11517 // With aarch64-jump-table-hardening, we only expand the jump table dispatch
11518 // sequence later, to guarantee the integrity of the intermediate values.
11520 "aarch64-jump-table-hardening")) {
11522 if (Subtarget->isTargetMachO()) {
11523 if (CM != CodeModel::Small && CM != CodeModel::Large)
11524 report_fatal_error("Unsupported code-model for hardened jump-table");
11525 } else {
11526 // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
11527 assert(Subtarget->isTargetELF() &&
11528 "jump table hardening only supported on MachO/ELF");
11529 if (CM != CodeModel::Small)
11530 report_fatal_error("Unsupported code-model for hardened jump-table");
11531 }
11532
11533 SDValue X16Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X16,
11534 Entry, SDValue());
11535 SDNode *B = DAG.getMachineNode(AArch64::BR_JumpTable, DL, MVT::Other,
11536 DAG.getTargetJumpTable(JTI, MVT::i32),
11537 X16Copy.getValue(0), X16Copy.getValue(1));
11538 return SDValue(B, 0);
11539 }
11540
11541 SDNode *Dest =
11542 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
11543 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
11544 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
11545 return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
11546}
11547
11548SDValue AArch64TargetLowering::LowerBRIND(SDValue Op, SelectionDAG &DAG) const {
11549 SDValue Chain = Op.getOperand(0);
11550 SDValue Dest = Op.getOperand(1);
11551
11552 // BR_JT is lowered to BRIND, but the latter lowering is specific to indirectbr.
11553 // Skip over the jump-table BRINDs, where the destination is JumpTableDest32.
11554 if (Dest->isMachineOpcode() &&
11555 Dest->getMachineOpcode() == AArch64::JumpTableDest32)
11556 return SDValue();
11557
11558 const MachineFunction &MF = DAG.getMachineFunction();
11559 std::optional<uint16_t> BADisc =
11561 if (!BADisc)
11562 return SDValue();
11563
11564 SDLoc DL(Op);
11565
11566 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
11568 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
11569
11570 SDNode *BrA = DAG.getMachineNode(AArch64::BRA, DL, MVT::Other,
11571 {Dest, Key, Disc, AddrDisc, Chain});
11572 return SDValue(BrA, 0);
11573}
11574
11575SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
11576 SelectionDAG &DAG) const {
11577 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
11579 if (CM == CodeModel::Large) {
11580 // Use the GOT for the large code model on iOS.
11581 if (Subtarget->isTargetMachO()) {
11582 return getGOT(CP, DAG);
11583 }
11585 return getAddrLarge(CP, DAG);
11586 } else if (CM == CodeModel::Tiny) {
11587 return getAddrTiny(CP, DAG);
11588 }
11589 return getAddr(CP, DAG);
11590}
11591
11592SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
11593 SelectionDAG &DAG) const {
11594 BlockAddressSDNode *BAN = cast<BlockAddressSDNode>(Op);
11595 const BlockAddress *BA = BAN->getBlockAddress();
11596
11597 if (std::optional<uint16_t> BADisc =
11599 *BA->getFunction())) {
11600 SDLoc DL(Op);
11601
11602 // This isn't cheap, but BRIND is rare.
11603 SDValue TargetBA = DAG.getTargetBlockAddress(BA, BAN->getValueType(0));
11604
11605 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
11606
11608 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
11609
11610 SDNode *MOV =
11611 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, {MVT::Other, MVT::Glue},
11612 {TargetBA, Key, AddrDisc, Disc});
11613 return DAG.getCopyFromReg(SDValue(MOV, 0), DL, AArch64::X16, MVT::i64,
11614 SDValue(MOV, 1));
11615 }
11616
11618 if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
11620 return getAddrLarge(BAN, DAG);
11621 } else if (CM == CodeModel::Tiny) {
11622 return getAddrTiny(BAN, DAG);
11623 }
11624 return getAddr(BAN, DAG);
11625}
11626
11627SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
11628 SelectionDAG &DAG) const {
11629 AArch64FunctionInfo *FuncInfo =
11631
11632 SDLoc DL(Op);
11633 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
11635 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
11636 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
11637 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
11638 MachinePointerInfo(SV));
11639}
11640
11641SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
11642 SelectionDAG &DAG) const {
11645
11646 SDLoc DL(Op);
11647 SDValue FR;
11648 if (Subtarget->isWindowsArm64EC()) {
11649 // With the Arm64EC ABI, we compute the address of the varargs save area
11650 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
11651 // but calls from an entry thunk can pass in a different address.
11652 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
11653 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
11655 if (FuncInfo->getVarArgsGPRSize() > 0)
11656 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
11657 else
11658 StackOffset = FuncInfo->getVarArgsStackOffset();
11659 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
11660 DAG.getConstant(StackOffset, DL, MVT::i64));
11661 } else {
11662 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
11663 ? FuncInfo->getVarArgsGPRIndex()
11664 : FuncInfo->getVarArgsStackIndex(),
11666 }
11667 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
11668 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
11669 MachinePointerInfo(SV));
11670}
11671
11672SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
11673 SelectionDAG &DAG) const {
11674 // The layout of the va_list struct is specified in the AArch64 Procedure Call
11675 // Standard, section B.3.
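// For reference, that layout is (sizes for LP64; ILP32 uses 4-byte pointers):
//   struct va_list {
//     void *__stack;   // next stack argument
//     void *__gr_top;  // end of the GP register save area
//     void *__vr_top;  // end of the FP/SIMD register save area
//     int   __gr_offs; // negative offset from __gr_top to the next GP arg
//     int   __vr_offs; // negative offset from __vr_top to the next FP arg
//   };
// which matches the stores at offsets 0, 8, 16, 24 and 28 below.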
11678 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
11679 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
11680 auto PtrVT = getPointerTy(DAG.getDataLayout());
11681 SDLoc DL(Op);
11682
11683 SDValue Chain = Op.getOperand(0);
11684 SDValue VAList = Op.getOperand(1);
11685 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
11687
11688 // void *__stack at offset 0
11689 unsigned Offset = 0;
11690 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
11691 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
11692 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
11693 MachinePointerInfo(SV), Align(PtrSize)));
11694
11695 // void *__gr_top at offset 8 (4 on ILP32)
11696 Offset += PtrSize;
11697 int GPRSize = FuncInfo->getVarArgsGPRSize();
11698 if (GPRSize > 0) {
11699 SDValue GRTop, GRTopAddr;
11700
11701 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11702 DAG.getConstant(Offset, DL, PtrVT));
11703
11704 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
11705 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
11706 DAG.getSignedConstant(GPRSize, DL, PtrVT));
11707 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
11708
11709 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
11711 Align(PtrSize)));
11712 }
11713
11714 // void *__vr_top at offset 16 (8 on ILP32)
11715 Offset += PtrSize;
11716 int FPRSize = FuncInfo->getVarArgsFPRSize();
11717 if (FPRSize > 0) {
11718 SDValue VRTop, VRTopAddr;
11719 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11720 DAG.getConstant(Offset, DL, PtrVT));
11721
11722 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
11723 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
11724 DAG.getSignedConstant(FPRSize, DL, PtrVT));
11725 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
11726
11727 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
11729 Align(PtrSize)));
11730 }
11731
11732 // int __gr_offs at offset 24 (12 on ILP32)
11733 Offset += PtrSize;
11734 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11735 DAG.getConstant(Offset, DL, PtrVT));
11736 MemOps.push_back(
11737 DAG.getStore(Chain, DL, DAG.getSignedConstant(-GPRSize, DL, MVT::i32),
11738 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
11739
11740 // int __vr_offs at offset 28 (16 on ILP32)
11741 Offset += 4;
11742 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11743 DAG.getConstant(Offset, DL, PtrVT));
11744 MemOps.push_back(
11745 DAG.getStore(Chain, DL, DAG.getSignedConstant(-FPRSize, DL, MVT::i32),
11746 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
11747
11748 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
11749}
11750
11751SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
11752 SelectionDAG &DAG) const {
11754 Function &F = MF.getFunction();
11755
11756 if (Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg()))
11757 return LowerWin64_VASTART(Op, DAG);
11758 else if (Subtarget->isTargetDarwin())
11759 return LowerDarwin_VASTART(Op, DAG);
11760 else
11761 return LowerAAPCS_VASTART(Op, DAG);
11762}
11763
11764SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
11765 SelectionDAG &DAG) const {
11766 // AAPCS has three pointers and two ints (= 32 bytes); Darwin has a single
11767 // pointer.
11768 SDLoc DL(Op);
11769 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
11770 unsigned VaListSize =
11771 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
11772 ? PtrSize
11773 : Subtarget->isTargetILP32() ? 20 : 32;
11774 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
11775 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
11776
11777 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
11778 DAG.getConstant(VaListSize, DL, MVT::i32),
11779 Align(PtrSize), false, false, /*CI=*/nullptr,
11780 std::nullopt, MachinePointerInfo(DestSV),
11781 MachinePointerInfo(SrcSV));
11782}
11783
11784SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
11785 assert(Subtarget->isTargetDarwin() &&
11786 "automatic va_arg instruction only works on Darwin");
11787
11788 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
11789 EVT VT = Op.getValueType();
11790 SDLoc DL(Op);
11791 SDValue Chain = Op.getOperand(0);
11792 SDValue Addr = Op.getOperand(1);
11793 MaybeAlign Align(Op.getConstantOperandVal(3));
11794 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
11795 auto PtrVT = getPointerTy(DAG.getDataLayout());
11796 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
11797 SDValue VAList =
11798 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
11799 Chain = VAList.getValue(1);
11800 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
11801
11802 if (VT.isScalableVector())
11803 report_fatal_error("Passing SVE types to variadic functions is "
11804 "currently not supported");
11805
11806 if (Align && *Align > MinSlotSize) {
11807 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11808 DAG.getConstant(Align->value() - 1, DL, PtrVT));
11809 VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
11810 DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
11811 }
11812
11813 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
11814 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
11815
11816 // Scalar integer and FP values smaller than 64 bits are implicitly extended
11817 // up to 64 bits. At the very least, we have to increase the striding of the
11818 // vaargs list to match this, and for FP values we need to introduce
11819 // FP_ROUND nodes as well.
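// For example, a float vararg is passed as a double under the usual C default
// argument promotions, so below it is loaded as an f64 and rounded back to
// f32 with FP_ROUND.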
11820 if (VT.isInteger() && !VT.isVector())
11821 ArgSize = std::max(ArgSize, MinSlotSize);
11822 bool NeedFPTrunc = false;
11823 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
11824 ArgSize = 8;
11825 NeedFPTrunc = true;
11826 }
11827
11828 // Increment the pointer, VAList, to the next vaarg
11829 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11830 DAG.getConstant(ArgSize, DL, PtrVT));
11831 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
11832
11833 // Store the incremented VAList to the legalized pointer
11834 SDValue APStore =
11835 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
11836
11837 // Load the actual argument out of the pointer VAList
11838 if (NeedFPTrunc) {
11839 // Load the value as an f64.
11840 SDValue WideFP =
11841 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
11842 // Round the value down to an f32.
11843 SDValue NarrowFP =
11844 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
11845 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
11846 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
11847 // Merge the rounded value with the chain output of the load.
11848 return DAG.getMergeValues(Ops, DL);
11849 }
11850
11851 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
11852}
11853
11854SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
11855 SelectionDAG &DAG) const {
11857 MFI.setFrameAddressIsTaken(true);
11858
11859 EVT VT = Op.getValueType();
11860 SDLoc DL(Op);
11861 unsigned Depth = Op.getConstantOperandVal(0);
11862 SDValue FrameAddr =
11863 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
11864 while (Depth--)
11865 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
11867
11868 if (Subtarget->isTargetILP32())
11869 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
11870 DAG.getValueType(VT));
11871
11872 return FrameAddr;
11873}
11874
11875SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
11876 SelectionDAG &DAG) const {
11878
11879 EVT VT = getPointerTy(DAG.getDataLayout());
11880 SDLoc DL(Op);
11881 int FI = MFI.CreateFixedObject(4, 0, false);
11882 return DAG.getFrameIndex(FI, VT);
11883}
11884
11885#define GET_REGISTER_MATCHER
11886#include "AArch64GenAsmMatcher.inc"
11887
11888// FIXME? Maybe this could be a TableGen attribute on some registers and
11889// this table could be generated automatically from RegInfo.
11890Register AArch64TargetLowering::
11891getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
11893 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
11894 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
11895 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
11896 if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
11897 !MRI->isReservedReg(MF, Reg))
11898 Reg = 0;
11899 }
11900 if (Reg)
11901 return Reg;
11902 report_fatal_error(Twine("Invalid register name \""
11903 + StringRef(RegName) + "\"."));
11904}
11905
11906SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
11907 SelectionDAG &DAG) const {
11909
11910 EVT VT = Op.getValueType();
11911 SDLoc DL(Op);
11912
11913 SDValue FrameAddr =
11914 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
11916
11917 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
11918}
11919
11920SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
11921 SelectionDAG &DAG) const {
11923 MachineFrameInfo &MFI = MF.getFrameInfo();
11924 MFI.setReturnAddressIsTaken(true);
11925
11926 EVT VT = Op.getValueType();
11927 SDLoc DL(Op);
11928 unsigned Depth = Op.getConstantOperandVal(0);
11929 SDValue ReturnAddress;
11930 if (Depth) {
11931 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
11933 ReturnAddress = DAG.getLoad(
11934 VT, DL, DAG.getEntryNode(),
11935 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
11936 } else {
11937 // Return LR, which contains the return address. Mark it an implicit
11938 // live-in.
11939 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
11940 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
11941 }
11942
11943 // The XPACLRI instruction assembles to a hint-space instruction before
11944 // Armv8.3-A, and can therefore be safely used on any pre-Armv8.3-A
11945 // architecture. On Armv8.3-A and onwards XPACI is available, so use
11946 // that instead.
11947 SDNode *St;
11948 if (Subtarget->hasPAuth()) {
11949 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
11950 } else {
11951 // XPACLRI operates on LR therefore we must move the operand accordingly.
11952 SDValue Chain =
11953 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
11954 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
11955 }
11956 return SDValue(St, 0);
11957}
11958
11959 /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
11960 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
11961SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
11962 SelectionDAG &DAG) const {
11963 SDValue Lo, Hi;
11964 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
11965 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
11966}
11967
11969 const GlobalAddressSDNode *GA) const {
11970 // Offsets are folded in the DAG combine rather than here so that we can
11971 // intelligently choose an offset based on the uses.
11972 return false;
11973}
11974
11976 bool OptForSize) const {
11977 bool IsLegal = false;
11978 // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
11979 // and for the 16-bit case when the target has full fp16 support.
11980 // We encode bf16 bit patterns as if they were fp16. This results in very
11981 // strange looking assembly but should populate the register with appropriate
11982 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
11983 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
11984 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
11985 // FIXME: We should be able to handle f128 as well with a clever lowering.
11986 const APInt ImmInt = Imm.bitcastToAPInt();
11987 if (VT == MVT::f64)
11988 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
11989 else if (VT == MVT::f32)
11990 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
11991 else if (VT == MVT::f16 || VT == MVT::bf16)
11992 IsLegal =
11993 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
11994 Imm.isPosZero();
11995
11996 // If we cannot materialize the value in the fmov immediate field, check if
11997 // the value can be encoded as the immediate operand of a logical instruction.
11998 // The immediate value will be created with either MOVZ, MOVN, or ORR.
11999 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
12000 // generate that fmov.
12001 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
12002 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
12003 // however the mov+fmov sequence is always better because of the reduced
12004 // cache pressure. The timings are still the same if you consider
12005 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
12006 // movw+movk is fused). So we limit the sequence to at most 2 instructions.
12009 assert(Insn.size() <= 4 &&
12010 "Should be able to build any value with at most 4 moves");
12011 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 4 : 2));
12012 IsLegal = Insn.size() <= Limit;
12013 }
12014
12015 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
12016 << " imm value: "; Imm.dump(););
12017 return IsLegal;
12018}
12019
12020//===----------------------------------------------------------------------===//
12021// AArch64 Optimization Hooks
12022//===----------------------------------------------------------------------===//
12023
12024static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
12025 SDValue Operand, SelectionDAG &DAG,
12026 int &ExtraSteps) {
12027 EVT VT = Operand.getValueType();
12028 if ((ST->hasNEON() &&
12029 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
12030 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
12031 VT == MVT::v4f32)) ||
12032 (ST->hasSVE() &&
12033 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
12035 // For the reciprocal estimates, convergence is quadratic, so the number
12036 // of digits is doubled after each iteration. In ARMv8, the accuracy of
12037 // the initial estimate is 2^-8. Thus the number of extra steps to refine
12038 // the result for float (23 mantissa bits) is 2 and for double (52
12039 // mantissa bits) is 3.
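// For example, APFloat::semanticsPrecision is 24 for float and 53 for double,
// so the formula below gives ceil(log2(24)) - ceil(log2(8)) = 5 - 3 = 2 steps
// for float and ceil(log2(53)) - 3 = 6 - 3 = 3 steps for double.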
12040 constexpr unsigned AccurateBits = 8;
12041 unsigned DesiredBits = APFloat::semanticsPrecision(VT.getFltSemantics());
12042 ExtraSteps = DesiredBits <= AccurateBits
12043 ? 0
12044 : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
12045 }
12046
12047 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
12048 }
12049
12050 return SDValue();
12051}
12052
12053SDValue
12054AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
12055 const DenormalMode &Mode) const {
12056 SDLoc DL(Op);
12057 EVT VT = Op.getValueType();
12058 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
12059 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
12060 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
12061}
12062
12063SDValue
12064AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
12065 SelectionDAG &DAG) const {
12066 return Op;
12067}
12068
12069SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
12070 SelectionDAG &DAG, int Enabled,
12071 int &ExtraSteps,
12072 bool &UseOneConst,
12073 bool Reciprocal) const {
12075 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
12076 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
12077 DAG, ExtraSteps)) {
12078 SDLoc DL(Operand);
12079 EVT VT = Operand.getValueType();
12080
12082
12083 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
12084 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
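// Sketch of the mapping onto the loop below: with X = Operand and
// E = Estimate, each step computes Step = FRSQRTS(X, E*E) = 0.5 * (3 - X*E^2)
// and then E = E * Step, which is exactly the Newton update above.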
12085 for (int i = ExtraSteps; i > 0; --i) {
12086 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
12087 Flags);
12088 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
12089 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
12090 }
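 // When the caller asked for sqrt rather than its reciprocal, recover it from
 // the refined estimate as sqrt(x) = x * (1/sqrt(x)).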
12091 if (!Reciprocal)
12092 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
12093
12094 ExtraSteps = 0;
12095 return Estimate;
12096 }
12097
12098 return SDValue();
12099}
12100
12101SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
12102 SelectionDAG &DAG, int Enabled,
12103 int &ExtraSteps) const {
12105 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
12106 DAG, ExtraSteps)) {
12107 SDLoc DL(Operand);
12108 EVT VT = Operand.getValueType();
12109
12111
12112 // Newton reciprocal iteration: E * (2 - X * E)
12113 // AArch64 reciprocal iteration instruction: (2 - M * N)
12114 for (int i = ExtraSteps; i > 0; --i) {
12115 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
12116 Estimate, Flags);
12117 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
12118 }
12119
12120 ExtraSteps = 0;
12121 return Estimate;
12122 }
12123
12124 return SDValue();
12125}
12126
12127//===----------------------------------------------------------------------===//
12128// AArch64 Inline Assembly Support
12129//===----------------------------------------------------------------------===//
12130
12131// Table of Constraints
12132// TODO: This is the current set of constraints supported by ARM for the
12133// compiler; not all of them may make sense.
12134//
12135// r - A general register
12136// w - An FP/SIMD register of some size in the range v0-v31
12137// x - An FP/SIMD register of some size in the range v0-v15
12138// I - Constant that can be used with an ADD instruction
12139// J - Constant that can be used with a SUB instruction
12140// K - Constant that can be used with a 32-bit logical instruction
12141// L - Constant that can be used with a 64-bit logical instruction
12142// M - Constant that can be used as a 32-bit MOV immediate
12143// N - Constant that can be used as a 64-bit MOV immediate
12144// Q - A memory reference with base register and no offset
12145// S - A symbolic address
12146// Y - Floating point constant zero
12147// Z - Integer constant zero
12148//
12149// Note that general register operands will be output using their 64-bit x
12150// register name, whatever the size of the variable, unless the asm operand
12151// is prefixed by the %w modifier. Floating-point and SIMD register operands
12152// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
12153// %q modifier.
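//
// For illustration only, a user-level inline asm statement exercising a few of
// these constraints might look like:
//   int sum;
//   asm("add %w0, %w1, %2" : "=r"(sum) : "r"(lhs), "I"(4095));
// Here "r" selects a general register (printed as a 32-bit w-register because
// of the %w modifier) and "I" accepts an ADD immediate in [0, 4095], possibly
// shifted left by 12.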
12154const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
12155 // At this point, we have to lower this constraint to something else, so we
12156 // lower it to an "r" or "w". However, by doing this we will force the result
12157 // to be in register, while the X constraint is much more permissive.
12158 //
12159 // Although we are correct (we are free to emit anything, without
12160 // constraints), we might break use cases that would expect us to be more
12161 // efficient and emit something else.
12162 if (!Subtarget->hasFPARMv8())
12163 return "r";
12164
12165 if (ConstraintVT.isFloatingPoint())
12166 return "w";
12167
12168 if (ConstraintVT.isVector() &&
12169 (ConstraintVT.getSizeInBits() == 64 ||
12170 ConstraintVT.getSizeInBits() == 128))
12171 return "w";
12172
12173 return "r";
12174}
12175
12177
12178// Returns a {Reg, RegisterClass} tuple if the constraint is
12179// a specific predicate register.
12180//
12181// For a constraint like "{pn3}", the default path in
12182// TargetLowering::getRegForInlineAsmConstraint() leads it to determine that a
12183// suitable register class for this register is "PPRorPNR", after which it
12184// determines that nxv16i1 is an appropriate type for the constraint, which is
12185// not what we want. The code here pre-empts this by matching the register
12186// explicitly.
12187static std::optional<std::pair<unsigned, const TargetRegisterClass *>>
12189 if (!Constraint.starts_with('{') || !Constraint.ends_with('}') ||
12190 Constraint[1] != 'p')
12191 return std::nullopt;
12192
12193 Constraint = Constraint.substr(2, Constraint.size() - 3);
12194 bool IsPredicateAsCount = Constraint.starts_with("n");
12195 if (IsPredicateAsCount)
12196 Constraint = Constraint.drop_front(1);
12197
12198 unsigned V;
12199 if (Constraint.getAsInteger(10, V) || V > 31)
12200 return std::nullopt;
12201
12202 if (IsPredicateAsCount)
12203 return std::make_pair(AArch64::PN0 + V, &AArch64::PNRRegClass);
12204 else
12205 return std::make_pair(AArch64::P0 + V, &AArch64::PPRRegClass);
12206}
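// For example, "{p0}" yields (P0, PPRRegClass) and "{pn8}" yields
// (PN8, PNRRegClass); anything else returns std::nullopt and falls through to
// the generic constraint handling.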
12207
12208static std::optional<PredicateConstraint>
12211 .Case("Uph", PredicateConstraint::Uph)
12212 .Case("Upl", PredicateConstraint::Upl)
12213 .Case("Upa", PredicateConstraint::Upa)
12214 .Default(std::nullopt);
12215}
12216
12217static const TargetRegisterClass *
12219 if (VT != MVT::aarch64svcount &&
12220 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
12221 return nullptr;
12222
12223 switch (Constraint) {
12224 case PredicateConstraint::Uph:
12225 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
12226 : &AArch64::PPR_p8to15RegClass;
12227 case PredicateConstraint::Upl:
12228 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
12229 : &AArch64::PPR_3bRegClass;
12230 case PredicateConstraint::Upa:
12231 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
12232 : &AArch64::PPRRegClass;
12233 }
12234
12235 llvm_unreachable("Missing PredicateConstraint!");
12236}
12237
12239
12240static std::optional<ReducedGprConstraint>
12243 .Case("Uci", ReducedGprConstraint::Uci)
12244 .Case("Ucj", ReducedGprConstraint::Ucj)
12245 .Default(std::nullopt);
12246}
12247
12248static const TargetRegisterClass *
12250 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
12251 return nullptr;
12252
12253 switch (Constraint) {
12254 case ReducedGprConstraint::Uci:
12255 return &AArch64::MatrixIndexGPR32_8_11RegClass;
12256 case ReducedGprConstraint::Ucj:
12257 return &AArch64::MatrixIndexGPR32_12_15RegClass;
12258 }
12259
12260 llvm_unreachable("Missing ReducedGprConstraint!");
12261}
12262
12263// The set of cc codes supported is from
12264// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
12267 .Case("{@cchi}", AArch64CC::HI)
12268 .Case("{@cccs}", AArch64CC::HS)
12269 .Case("{@cclo}", AArch64CC::LO)
12270 .Case("{@ccls}", AArch64CC::LS)
12271 .Case("{@cccc}", AArch64CC::LO)
12272 .Case("{@cceq}", AArch64CC::EQ)
12273 .Case("{@ccgt}", AArch64CC::GT)
12274 .Case("{@ccge}", AArch64CC::GE)
12275 .Case("{@cclt}", AArch64CC::LT)
12276 .Case("{@ccle}", AArch64CC::LE)
12277 .Case("{@cchs}", AArch64CC::HS)
12278 .Case("{@ccne}", AArch64CC::NE)
12279 .Case("{@ccvc}", AArch64CC::VC)
12280 .Case("{@ccpl}", AArch64CC::PL)
12281 .Case("{@ccvs}", AArch64CC::VS)
12282 .Case("{@ccmi}", AArch64CC::MI)
12284 return Cond;
12285}
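// For illustration, a flag-output operand in user code might look like
//   int is_eq;
//   asm("cmp %1, %2" : "=@cceq"(is_eq) : "r"(a), "r"(b));
// and is lowered below by reading NZCV and materializing the EQ condition as a
// 0/1 value via CSINC.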
12286
12287/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
12288/// WZR, invert(<cond>)'.
12290 SelectionDAG &DAG) {
12291 return DAG.getNode(
12292 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
12293 DAG.getConstant(0, DL, MVT::i32),
12294 DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32), NZCV);
12295}
12296
12297// Lower @cc flag output via getSETCC.
12298SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
12299 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
12300 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
12301 AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
12302 if (Cond == AArch64CC::Invalid)
12303 return SDValue();
12304 // The output variable should be a scalar integer.
12305 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
12306 OpInfo.ConstraintVT.getSizeInBits() < 8)
12307 report_fatal_error("Flag output operand is of invalid type");
12308
12309 // Get NZCV register. Only update chain when copyfrom is glued.
12310 if (Glue.getNode()) {
12311 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32, Glue);
12312 Chain = Glue.getValue(1);
12313 } else
12314 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32);
12315 // Extract CC code.
12316 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
12317
12319
12320 // Truncate or ZERO_EXTEND based on value types.
12321 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
12322 Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
12323 else
12324 Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
12325
12326 return Result;
12327}
12328
12329/// getConstraintType - Given a constraint letter, return the type of
12330/// constraint it is for this target.
12332AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
12333 if (Constraint.size() == 1) {
12334 switch (Constraint[0]) {
12335 default:
12336 break;
12337 case 'x':
12338 case 'w':
12339 case 'y':
12340 return C_RegisterClass;
12341 // An address with a single base register. Due to the way we
12342 // currently handle addresses, it is the same as 'r'.
12343 case 'Q':
12344 return C_Memory;
12345 case 'I':
12346 case 'J':
12347 case 'K':
12348 case 'L':
12349 case 'M':
12350 case 'N':
12351 case 'Y':
12352 case 'Z':
12353 return C_Immediate;
12354 case 'z':
12355 case 'S': // A symbol or label reference with a constant offset
12356 return C_Other;
12357 }
12358 } else if (parsePredicateConstraint(Constraint))
12359 return C_RegisterClass;
12360 else if (parseReducedGprConstraint(Constraint))
12361 return C_RegisterClass;
12362 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
12363 return C_Other;
12364 return TargetLowering::getConstraintType(Constraint);
12365}
12366
12367/// Examine constraint type and operand type and determine a weight value.
12368/// This object must already have been set up with the operand type
12369/// and the current alternative constraint selected.
12371AArch64TargetLowering::getSingleConstraintMatchWeight(
12372 AsmOperandInfo &info, const char *constraint) const {
12374 Value *CallOperandVal = info.CallOperandVal;
12375 // If we don't have a value, we can't do a match,
12376 // but allow it at the lowest weight.
12377 if (!CallOperandVal)
12378 return CW_Default;
12379 Type *type = CallOperandVal->getType();
12380 // Look at the constraint type.
12381 switch (*constraint) {
12382 default:
12384 break;
12385 case 'x':
12386 case 'w':
12387 case 'y':
12388 if (type->isFloatingPointTy() || type->isVectorTy())
12389 weight = CW_Register;
12390 break;
12391 case 'z':
12392 weight = CW_Constant;
12393 break;
12394 case 'U':
12395 if (parsePredicateConstraint(constraint) ||
12396 parseReducedGprConstraint(constraint))
12397 weight = CW_Register;
12398 break;
12399 }
12400 return weight;
12401}
12402
12403std::pair<unsigned, const TargetRegisterClass *>
12404AArch64TargetLowering::getRegForInlineAsmConstraint(
12405 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
12406 if (Constraint.size() == 1) {
12407 switch (Constraint[0]) {
12408 case 'r':
12409 if (VT.isScalableVector())
12410 return std::make_pair(0U, nullptr);
12411 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
12412 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
12413 if (VT.getFixedSizeInBits() == 64)
12414 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
12415 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
12416 case 'w': {
12417 if (!Subtarget->hasFPARMv8())
12418 break;
12419 if (VT.isScalableVector()) {
12420 if (VT.getVectorElementType() != MVT::i1)
12421 return std::make_pair(0U, &AArch64::ZPRRegClass);
12422 return std::make_pair(0U, nullptr);
12423 }
12424 if (VT == MVT::Other)
12425 break;
12426 uint64_t VTSize = VT.getFixedSizeInBits();
12427 if (VTSize == 16)
12428 return std::make_pair(0U, &AArch64::FPR16RegClass);
12429 if (VTSize == 32)
12430 return std::make_pair(0U, &AArch64::FPR32RegClass);
12431 if (VTSize == 64)
12432 return std::make_pair(0U, &AArch64::FPR64RegClass);
12433 if (VTSize == 128)
12434 return std::make_pair(0U, &AArch64::FPR128RegClass);
12435 break;
12436 }
12437 // The instructions that this constraint is designed for can
12438 // only take 128-bit registers so just use that regclass.
12439 case 'x':
12440 if (!Subtarget->hasFPARMv8())
12441 break;
12442 if (VT.isScalableVector())
12443 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
12444 if (VT.getSizeInBits() == 128)
12445 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
12446 break;
12447 case 'y':
12448 if (!Subtarget->hasFPARMv8())
12449 break;
12450 if (VT.isScalableVector())
12451 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
12452 break;
12453 }
12454 } else {
12455 if (const auto P = parsePredicateRegAsConstraint(Constraint))
12456 return *P;
12457 if (const auto PC = parsePredicateConstraint(Constraint))
12458 if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
12459 return std::make_pair(0U, RegClass);
12460
12461 if (const auto RGC = parseReducedGprConstraint(Constraint))
12462 if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
12463 return std::make_pair(0U, RegClass);
12464 }
12465 if (StringRef("{cc}").equals_insensitive(Constraint) ||
12467 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
12468
12469 if (Constraint == "{za}") {
12470 return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
12471 }
12472
12473 if (Constraint == "{zt0}") {
12474 return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
12475 }
12476
12477 // Use the default implementation in TargetLowering to convert the register
12478 // constraint into a member of a register class.
12479 std::pair<unsigned, const TargetRegisterClass *> Res;
12481
12482 // Not found as a standard register?
12483 if (!Res.second) {
12484 unsigned Size = Constraint.size();
12485 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
12486 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
12487 int RegNo;
12488 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
12489 if (!Failed && RegNo >= 0 && RegNo <= 31) {
12490 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
12491 // By default we'll emit v0-v31 for this unless there's a modifier where
12492 // we'll emit the correct register as well.
12493 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
12494 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
12495 Res.second = &AArch64::FPR64RegClass;
12496 } else {
12497 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
12498 Res.second = &AArch64::FPR128RegClass;
12499 }
12500 }
12501 }
12502 }
12503
12504 if (Res.second && !Subtarget->hasFPARMv8() &&
12505 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
12506 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
12507 return std::make_pair(0U, nullptr);
12508
12509 return Res;
12510}
12511
12513 llvm::Type *Ty,
12514 bool AllowUnknown) const {
12515 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
12516 return EVT(MVT::i64x8);
12517
12518 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
12519}
12520
12521/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
12522/// vector. If it is invalid, don't add anything to Ops.
12523void AArch64TargetLowering::LowerAsmOperandForConstraint(
12524 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
12525 SelectionDAG &DAG) const {
12526 SDValue Result;
12527
12528 // Currently only support length 1 constraints.
12529 if (Constraint.size() != 1)
12530 return;
12531
12532 char ConstraintLetter = Constraint[0];
12533 switch (ConstraintLetter) {
12534 default:
12535 break;
12536
12537 // This set of constraints deals with valid constants for various instructions.
12538 // Validate and return a target constant for them if we can.
12539 case 'z': {
12540 // 'z' maps to xzr or wzr so it needs an input of 0.
12541 if (!isNullConstant(Op))
12542 return;
12543
12544 if (Op.getValueType() == MVT::i64)
12545 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
12546 else
12547 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
12548 break;
12549 }
12550 case 'S':
12551 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
12552 // supported for PIC while "s" isn't, making "s" less useful. We implement
12553 // "S" but not "s".
12555 break;
12556
12557 case 'I':
12558 case 'J':
12559 case 'K':
12560 case 'L':
12561 case 'M':
12562 case 'N':
12563 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
12564 if (!C)
12565 return;
12566
12567 // Grab the value and do some validation.
12568 uint64_t CVal = C->getZExtValue();
12569 switch (ConstraintLetter) {
12570 // The I constraint applies only to simple ADD or SUB immediate operands:
12571 // i.e. 0 to 4095 with optional shift by 12
12572 // The J constraint applies only to ADD or SUB immediates that would be
12573 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
12574 // instruction [or vice versa], in other words -1 to -4095 with optional
12575 // left shift by 12.
12576 case 'I':
12577 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
12578 break;
12579 return;
12580 case 'J': {
12581 uint64_t NVal = -C->getSExtValue();
12582 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
12583 CVal = C->getSExtValue();
12584 break;
12585 }
12586 return;
12587 }
12588 // The K and L constraints apply *only* to logical immediates, including
12589 // what used to be the MOVI alias for ORR (though the MOVI alias has now
12590 // been removed and MOV should be used). So these constraints have to
12591 // distinguish between bit patterns that are valid 32-bit or 64-bit
12592 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
12593 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
12594 // versa.
12595 case 'K':
12596 if (AArch64_AM::isLogicalImmediate(CVal, 32))
12597 break;
12598 return;
12599 case 'L':
12600 if (AArch64_AM::isLogicalImmediate(CVal, 64))
12601 break;
12602 return;
12603 // The M and N constraints are a superset of K and L respectively, for use
12604 // with the MOV (immediate) alias. As well as the logical immediates they
12605 // also match 32 or 64-bit immediates that can be loaded either using a
12606 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
12607 // (M) or 64-bit 0x1234000000000000 (N), etc.
12608 // As a note, some of this code is liberally stolen from the asm parser.
12609 case 'M': {
12610 if (!isUInt<32>(CVal))
12611 return;
12612 if (AArch64_AM::isLogicalImmediate(CVal, 32))
12613 break;
12614 if ((CVal & 0xFFFF) == CVal)
12615 break;
12616 if ((CVal & 0xFFFF0000ULL) == CVal)
12617 break;
12618 uint64_t NCVal = ~(uint32_t)CVal;
12619 if ((NCVal & 0xFFFFULL) == NCVal)
12620 break;
12621 if ((NCVal & 0xFFFF0000ULL) == NCVal)
12622 break;
12623 return;
12624 }
12625 case 'N': {
12626 if (AArch64_AM::isLogicalImmediate(CVal, 64))
12627 break;
12628 if ((CVal & 0xFFFFULL) == CVal)
12629 break;
12630 if ((CVal & 0xFFFF0000ULL) == CVal)
12631 break;
12632 if ((CVal & 0xFFFF00000000ULL) == CVal)
12633 break;
12634 if ((CVal & 0xFFFF000000000000ULL) == CVal)
12635 break;
12636 uint64_t NCVal = ~CVal;
12637 if ((NCVal & 0xFFFFULL) == NCVal)
12638 break;
12639 if ((NCVal & 0xFFFF0000ULL) == NCVal)
12640 break;
12641 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
12642 break;
12643 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
12644 break;
12645 return;
12646 }
12647 default:
12648 return;
12649 }
12650
12651 // All assembler immediates are 64-bit integers.
12652 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
12653 break;
12654 }
12655
12656 if (Result.getNode()) {
12657 Ops.push_back(Result);
12658 return;
12659 }
12660
12661 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
12662}
12663
12664//===----------------------------------------------------------------------===//
12665// AArch64 Advanced SIMD Support
12666//===----------------------------------------------------------------------===//
12667
12668/// WidenVector - Given a value in the V64 register class, produce the
12669/// equivalent value in the V128 register class.
12671 EVT VT = V64Reg.getValueType();
12672 unsigned NarrowSize = VT.getVectorNumElements();
12673 MVT EltTy = VT.getVectorElementType().getSimpleVT();
12674 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
12675 SDLoc DL(V64Reg);
12676
12677 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
12678 V64Reg, DAG.getConstant(0, DL, MVT::i64));
12679}
12680
12681/// getExtFactor - Determine the adjustment factor for the position when
12682/// generating an "extract from vector registers" instruction.
12683static unsigned getExtFactor(SDValue &V) {
12684 EVT EltType = V.getValueType().getVectorElementType();
12685 return EltType.getSizeInBits() / 8;
12686}
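// For example, for a vector of i16 elements the factor is 2, so a starting
// lane of 3 corresponds to an EXT byte immediate of 6.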
12687
12688// Check if a vector is built from one vector via extracted elements of
12689// another together with an AND mask, ensuring that all elements fit
12690// within range. This can be reconstructed using AND and NEON's TBL1.
12692 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
12693 SDLoc dl(Op);
12694 EVT VT = Op.getValueType();
12695 assert(!VT.isScalableVector() &&
12696 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
12697
12698 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
12699 // directly to TBL1.
12700 if (VT != MVT::v16i8 && VT != MVT::v8i8)
12701 return SDValue();
12702
12703 unsigned NumElts = VT.getVectorNumElements();
12704 assert((NumElts == 8 || NumElts == 16) &&
12705 "Need to have exactly 8 or 16 elements in vector.");
12706
12707 SDValue SourceVec;
12708 SDValue MaskSourceVec;
12709 SmallVector<SDValue, 16> AndMaskConstants;
12710
12711 for (unsigned i = 0; i < NumElts; ++i) {
12712 SDValue V = Op.getOperand(i);
12713 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12714 return SDValue();
12715
12716 SDValue OperandSourceVec = V.getOperand(0);
12717 if (!SourceVec)
12718 SourceVec = OperandSourceVec;
12719 else if (SourceVec != OperandSourceVec)
12720 return SDValue();
12721
12722 // This only looks at shuffles with elements that are
12723 // a) truncated by a constant AND mask extracted from a mask vector, or
12724 // b) extracted directly from a mask vector.
12725 SDValue MaskSource = V.getOperand(1);
12726 if (MaskSource.getOpcode() == ISD::AND) {
12727 if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
12728 return SDValue();
12729
12730 AndMaskConstants.push_back(MaskSource.getOperand(1));
12731 MaskSource = MaskSource->getOperand(0);
12732 } else if (!AndMaskConstants.empty()) {
12733 // Either all or no operands should have an AND mask.
12734 return SDValue();
12735 }
12736
12737 // An ANY_EXTEND may be inserted between the AND and the source vector
12738 // extraction. We don't care about that, so we can just skip it.
12739 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
12740 MaskSource = MaskSource.getOperand(0);
12741
12742 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12743 return SDValue();
12744
12745 SDValue MaskIdx = MaskSource.getOperand(1);
12746 if (!isa<ConstantSDNode>(MaskIdx) ||
12747 !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
12748 return SDValue();
12749
12750 // We only apply this if all elements come from the same vector with the
12751 // same vector type.
12752 if (!MaskSourceVec) {
12753 MaskSourceVec = MaskSource->getOperand(0);
12754 if (MaskSourceVec.getValueType() != VT)
12755 return SDValue();
12756 } else if (MaskSourceVec != MaskSource->getOperand(0)) {
12757 return SDValue();
12758 }
12759 }
12760
12761 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
12762 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
12763 // insert, we know that the index in the mask must be smaller than the number
12764 // of elements in the source, or we would have an out-of-bounds access.
12765 if (NumElts == 8)
12766 SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, SourceVec,
12767 DAG.getUNDEF(VT));
12768
12769 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
12770 if (!AndMaskConstants.empty())
12771 MaskSourceVec = DAG.getNode(ISD::AND, dl, VT, MaskSourceVec,
12772 DAG.getBuildVector(VT, dl, AndMaskConstants));
12773
12774 return DAG.getNode(
12776 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, dl, MVT::i32), SourceVec,
12777 MaskSourceVec);
12778}
12779
12780// Gather data to see if the operation can be modelled as a
12781// shuffle in combination with VEXTs.
12783 SelectionDAG &DAG) const {
12784 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
12785 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
12786 SDLoc dl(Op);
12787 EVT VT = Op.getValueType();
12788 assert(!VT.isScalableVector() &&
12789 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
12790 unsigned NumElts = VT.getVectorNumElements();
12791
12792 struct ShuffleSourceInfo {
12793 SDValue Vec;
12794 unsigned MinElt;
12795 unsigned MaxElt;
12796
12797 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
12798 // be compatible with the shuffle we intend to construct. As a result
12799 // ShuffleVec will be some sliding window into the original Vec.
12800 SDValue ShuffleVec;
12801
12802 // Code should guarantee that element i in Vec starts at element
12803 // "WindowBase + i * WindowScale" in ShuffleVec.
12804 int WindowBase;
12805 int WindowScale;
12806
12807 ShuffleSourceInfo(SDValue Vec)
12808 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
12809 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
12810
12811 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
12812 };
12813
12814 // First gather all vectors used as an immediate source for this BUILD_VECTOR
12815 // node.
12817 for (unsigned i = 0; i < NumElts; ++i) {
12818 SDValue V = Op.getOperand(i);
12819 if (V.isUndef())
12820 continue;
12821 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
12822 !isa<ConstantSDNode>(V.getOperand(1)) ||
12823 V.getOperand(0).getValueType().isScalableVector()) {
12824 LLVM_DEBUG(
12825 dbgs() << "Reshuffle failed: "
12826 "a shuffle can only come from building a vector from "
12827 "various elements of other fixed-width vectors, provided "
12828 "their indices are constant\n");
12829 return SDValue();
12830 }
12831
12832 // Add this element source to the list if it's not already there.
12833 SDValue SourceVec = V.getOperand(0);
12834 auto Source = find(Sources, SourceVec);
12835 if (Source == Sources.end())
12836 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
12837
12838 // Update the minimum and maximum lane number seen.
12839 unsigned EltNo = V.getConstantOperandVal(1);
12840 Source->MinElt = std::min(Source->MinElt, EltNo);
12841 Source->MaxElt = std::max(Source->MaxElt, EltNo);
12842 }
12843
12844 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
12845 // better than moving to/from gpr registers for larger vectors.
12846 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
12847 // Construct a mask for the tbl. We may need to adjust the index for types
12848 // larger than i8.
12850 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
12851 for (unsigned I = 0; I < NumElts; ++I) {
12852 SDValue V = Op.getOperand(I);
12853 if (V.isUndef()) {
12854 for (unsigned OF = 0; OF < OutputFactor; OF++)
12855 Mask.push_back(-1);
12856 continue;
12857 }
12858 // Set the Mask lanes adjusted for the size of the input and output
12859 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
12860 // output element, adjusted in their positions per input and output types.
12861 unsigned Lane = V.getConstantOperandVal(1);
12862 for (unsigned S = 0; S < Sources.size(); S++) {
12863 if (V.getOperand(0) == Sources[S].Vec) {
12864 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
12865 unsigned InputBase = 16 * S + Lane * InputSize / 8;
12866 for (unsigned OF = 0; OF < OutputFactor; OF++)
12867 Mask.push_back(InputBase + OF);
12868 break;
12869 }
12870 }
12871 }
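 // For instance, for a v8i16 BUILD_VECTOR whose sources are also v8i16, an
 // element taken from lane 3 of the second source (S == 1) contributes the
 // byte selectors 16*1 + 3*16/8 = 22 and 23 to the TBL mask.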
12872
12873 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
12874 // v16i8, and the TBLMask
12875 SmallVector<SDValue, 16> TBLOperands;
12876 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
12877 ? Intrinsic::aarch64_neon_tbl3
12878 : Intrinsic::aarch64_neon_tbl4,
12879 dl, MVT::i32));
12880 for (unsigned i = 0; i < Sources.size(); i++) {
12881 SDValue Src = Sources[i].Vec;
12882 EVT SrcVT = Src.getValueType();
12883 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
12884 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
12885 "Expected a legally typed vector");
12886 if (SrcVT.is64BitVector())
12887 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src,
12888 DAG.getUNDEF(MVT::v8i8));
12889 TBLOperands.push_back(Src);
12890 }
12891
12893 for (unsigned i = 0; i < Mask.size(); i++)
12894 TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32));
12895 assert((Mask.size() == 8 || Mask.size() == 16) &&
12896 "Expected a v8i8 or v16i8 Mask");
12897 TBLOperands.push_back(
12898 DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));
12899
12900 SDValue Shuffle =
12902 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
12903 return DAG.getBitcast(VT, Shuffle);
12904 }
12905
12906 if (Sources.size() > 2) {
12907 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
12908 << "sensible when at most two source vectors are "
12909 << "involved\n");
12910 return SDValue();
12911 }
12912
12913 // Find out the smallest element size among result and two sources, and use
12914 // it as element size to build the shuffle_vector.
12915 EVT SmallestEltTy = VT.getVectorElementType();
12916 for (auto &Source : Sources) {
12917 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
12918 if (SrcEltTy.bitsLT(SmallestEltTy)) {
12919 SmallestEltTy = SrcEltTy;
12920 }
12921 }
12922 unsigned ResMultiplier =
12923 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
12924 uint64_t VTSize = VT.getFixedSizeInBits();
12925 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
12926 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
12927
12928 // If the source vector is too wide or too narrow, we may nevertheless be able
12929 // to construct a compatible shuffle either by concatenating it with UNDEF or
12930 // extracting a suitable range of elements.
12931 for (auto &Src : Sources) {
12932 EVT SrcVT = Src.ShuffleVec.getValueType();
12933
12934 TypeSize SrcVTSize = SrcVT.getSizeInBits();
12935 if (SrcVTSize == TypeSize::getFixed(VTSize))
12936 continue;
12937
12938 // This stage of the search produces a source with the same element type as
12939 // the original, but with a total width matching the BUILD_VECTOR output.
12940 EVT EltVT = SrcVT.getVectorElementType();
12941 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
12942 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
12943
12944 if (SrcVTSize.getFixedValue() < VTSize) {
12945 assert(2 * SrcVTSize == VTSize);
12946 // We can pad out the smaller vector with UNDEF for free, so do that and
12947 // carry on with the shuffle construction.
12948 Src.ShuffleVec =
12949 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
12950 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
12951 continue;
12952 }
12953
12954 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
12955 LLVM_DEBUG(
12956 dbgs() << "Reshuffle failed: result vector too small to extract\n");
12957 return SDValue();
12958 }
12959
12960 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
12961 LLVM_DEBUG(
12962 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
12963 return SDValue();
12964 }
12965
12966 if (Src.MinElt >= NumSrcElts) {
12967 // The extraction can just take the second half
12968 Src.ShuffleVec =
12969 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
12970 DAG.getConstant(NumSrcElts, dl, MVT::i64));
12971 Src.WindowBase = -NumSrcElts;
12972 } else if (Src.MaxElt < NumSrcElts) {
12973 // The extraction can just take the first half
12974 Src.ShuffleVec =
12975 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
12976 DAG.getConstant(0, dl, MVT::i64));
12977 } else {
12978 // An actual VEXT is needed
12979 SDValue VEXTSrc1 =
12980 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
12981 DAG.getConstant(0, dl, MVT::i64));
12982 SDValue VEXTSrc2 =
12983 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
12984 DAG.getConstant(NumSrcElts, dl, MVT::i64));
12985 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
12986
12987 if (!SrcVT.is64BitVector()) {
12988 LLVM_DEBUG(
12989 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
12990 "for SVE vectors.");
12991 return SDValue();
12992 }
12993
12994 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
12995 VEXTSrc2,
12996 DAG.getConstant(Imm, dl, MVT::i32));
12997 Src.WindowBase = -Src.MinElt;
12998 }
12999 }
13000
13001 // Another possible incompatibility occurs from the vector element types. We
13002 // can fix this by bitcasting the source vectors to the same type we intend
13003 // for the shuffle.
13004 for (auto &Src : Sources) {
13005 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
13006 if (SrcEltTy == SmallestEltTy)
13007 continue;
13008 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
13009 if (DAG.getDataLayout().isBigEndian()) {
13010 Src.ShuffleVec =
13011 DAG.getNode(AArch64ISD::NVCAST, dl, ShuffleVT, Src.ShuffleVec);
13012 } else {
13013 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
13014 }
13015 Src.WindowScale =
13016 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
13017 Src.WindowBase *= Src.WindowScale;
13018 }
13019
13020 // Final check before we try to actually produce a shuffle.
13021 LLVM_DEBUG({
13022 for (auto Src : Sources)
13023 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
13024 });
13025
13026 // The stars all align; our next step is to produce the mask for the shuffle.
13027 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
13028 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
13029 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
13030 SDValue Entry = Op.getOperand(i);
13031 if (Entry.isUndef())
13032 continue;
13033
13034 auto Src = find(Sources, Entry.getOperand(0));
13035 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
13036
13037 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
13038 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
13039 // segment.
13040 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
13041 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
13042 VT.getScalarSizeInBits());
13043 int LanesDefined = BitsDefined / BitsPerShuffleLane;
13044
13045 // This source is expected to fill ResMultiplier lanes of the final shuffle,
13046 // starting at the appropriate offset.
13047 int *LaneMask = &Mask[i * ResMultiplier];
13048
13049 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
13050 ExtractBase += NumElts * (Src - Sources.begin());
13051 for (int j = 0; j < LanesDefined; ++j)
13052 LaneMask[j] = ExtractBase + j;
13053 }
13054
13055 // Final check before we try to produce nonsense...
13056 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
13057 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
13058 return SDValue();
13059 }
13060
13061 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
13062 for (unsigned i = 0; i < Sources.size(); ++i)
13063 ShuffleOps[i] = Sources[i].ShuffleVec;
13064
13065 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
13066 ShuffleOps[1], Mask);
13067 SDValue V;
13068 if (DAG.getDataLayout().isBigEndian()) {
13069 V = DAG.getNode(AArch64ISD::NVCAST, dl, VT, Shuffle);
13070 } else {
13071 V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
13072 }
13073
13074 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
13075 dbgs() << "Reshuffle, creating node: "; V.dump(););
13076
13077 return V;
13078}
13079
13080// Check if an EXT instruction can handle the shuffle mask when the
13081// vector sources of the shuffle are the same.
13082static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
13083 unsigned NumElts = VT.getVectorNumElements();
13084
13085 // Assume that the first shuffle index is not UNDEF. Fail if it is.
13086 if (M[0] < 0)
13087 return false;
13088
13089 Imm = M[0];
13090
13091 // If this is a VEXT shuffle, the immediate value is the index of the first
13092 // element. The other shuffle indices must be the successive elements after
13093 // the first one.
13094 unsigned ExpectedElt = Imm;
13095 for (unsigned i = 1; i < NumElts; ++i) {
13096 // Increment the expected index. If it wraps around, just follow it
13097 // back to index zero and keep going.
13098 ++ExpectedElt;
13099 if (ExpectedElt == NumElts)
13100 ExpectedElt = 0;
13101
13102 if (M[i] < 0)
13103 continue; // ignore UNDEF indices
13104 if (ExpectedElt != static_cast<unsigned>(M[i]))
13105 return false;
13106 }
13107
13108 return true;
13109}
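// For example, with VT = v8i8 the mask <3, 4, 5, 6, 7, 0, 1, 2> is a singleton
// EXT with Imm = 3: the indices are consecutive and simply wrap around the
// single source vector.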
13110
13111// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
13112// v4i32s. This is really a truncate, which we can construct out of (legal)
13113// concats and truncate nodes.
13115 if (V.getValueType() != MVT::v16i8)
13116 return SDValue();
13117 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
13118
13119 for (unsigned X = 0; X < 4; X++) {
13120 // Check the first item in each group is an extract from lane 0 of a v4i32
13121 // or v4i16.
13122 SDValue BaseExt = V.getOperand(X * 4);
13123 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13124 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
13125 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
13126 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
13127 BaseExt.getConstantOperandVal(1) != 0)
13128 return SDValue();
13129 SDValue Base = BaseExt.getOperand(0);
13130 // And check the other items are extracts from the same vector.
13131 for (unsigned Y = 1; Y < 4; Y++) {
13132 SDValue Ext = V.getOperand(X * 4 + Y);
13133 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13134 Ext.getOperand(0) != Base ||
13135 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
13136 Ext.getConstantOperandVal(1) != Y)
13137 return SDValue();
13138 }
13139 }
13140
13141 // Turn the buildvector into a series of truncates and concats, which will
13142 // become uzp1 instructions. Any v4i32s we found get truncated to v4i16, which
13143 // are concatenated together to produce two v8i16s. These are both truncated
13144 // and concatenated together.
13145 SDLoc DL(V);
13146 SDValue Trunc[4] = {
13147 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
13148 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
13149 for (SDValue &V : Trunc)
13150 if (V.getValueType() == MVT::v4i32)
13151 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
13152 SDValue Concat0 =
13153 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
13154 SDValue Concat1 =
13155 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
13156 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
13157 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
13158 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
13159}
13160
13161/// Check if a vector shuffle corresponds to a DUP instructions with a larger
13162/// element width than the vector lane type. If that is the case the function
13163/// returns true and writes the value of the DUP instruction lane operand into
13164/// DupLaneOp
13165static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
13166 unsigned &DupLaneOp) {
13167 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
13168 "Only possible block sizes for wide DUP are: 16, 32, 64");
13169
13170 if (BlockSize <= VT.getScalarSizeInBits())
13171 return false;
13172 if (BlockSize % VT.getScalarSizeInBits() != 0)
13173 return false;
13174 if (VT.getSizeInBits() % BlockSize != 0)
13175 return false;
13176
13177 size_t SingleVecNumElements = VT.getVectorNumElements();
13178 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
13179 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
13180
13181 // We are looking for masks like
13182 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
13183 // might be replaced by 'undefined'. BlockIndices will eventually contain
13184 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
13185 // for the above examples)
13186 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
13187 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
13188 for (size_t I = 0; I < NumEltsPerBlock; I++) {
13189 int Elt = M[BlockIndex * NumEltsPerBlock + I];
13190 if (Elt < 0)
13191 continue;
13192 // For now we don't support shuffles that use the second operand
13193 if ((unsigned)Elt >= SingleVecNumElements)
13194 return false;
13195 if (BlockElts[I] < 0)
13196 BlockElts[I] = Elt;
13197 else if (BlockElts[I] != Elt)
13198 return false;
13199 }
13200
13201 // We found a candidate block (possibly with some undefs). It must be a
13202 // sequence of consecutive integers starting with a value divisible by
13203 // NumEltsPerBlock, with some values possibly replaced by undefs.
13204
13205 // Find first non-undef element
13206 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
13207 assert(FirstRealEltIter != BlockElts.end() &&
13208 "Shuffle with all-undefs must have been caught by previous cases, "
13209 "e.g. isSplat()");
13210 if (FirstRealEltIter == BlockElts.end()) {
13211 DupLaneOp = 0;
13212 return true;
13213 }
13214
13215 // Index of FirstRealElt in BlockElts
13216 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
13217
13218 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
13219 return false;
13220 // BlockElts[0] must have the following value if it isn't undef:
13221 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
13222
13223 // Check the first element
13224 if (Elt0 % NumEltsPerBlock != 0)
13225 return false;
13226 // Check that the sequence indeed consists of consecutive integers (modulo
13227 // undefs)
13228 for (size_t I = 0; I < NumEltsPerBlock; I++)
13229 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
13230 return false;
13231
13232 DupLaneOp = Elt0 / NumEltsPerBlock;
13233 return true;
13234}
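// For example, for VT = v8i16 and BlockSize = 32 the mask
// <2, 3, 2, 3, 2, 3, 2, 3> duplicates the 32-bit block starting at lane 2, so
// DupLaneOp is set to 1.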
13235
13236// Check if an EXT instruction can handle the shuffle mask when the
13237// vector sources of the shuffle are different.
13238static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
13239 unsigned &Imm) {
13240 // Look for the first non-undef element.
13241 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
13242
13243 // Benefit from APInt to handle overflow when calculating the expected element.
13244 unsigned NumElts = VT.getVectorNumElements();
13245 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
13246 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1, /*isSigned=*/false,
13247 /*implicitTrunc=*/true);
13248 // The following shuffle indices must be the successive elements after the
13249 // first real element.
13250 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
13251 return Elt != ExpectedElt++ && Elt != -1;
13252 });
13253 if (FoundWrongElt)
13254 return false;
13255
13256 // The index of an EXT is the first element if it is not UNDEF.
13257 // Watch out for the beginning UNDEFs. The EXT index should be the expected
13258 // value of the first element. E.g.
13259 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
13260 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
13261 // ExpectedElt is the last mask index plus 1.
13262 Imm = ExpectedElt.getZExtValue();
13263
13264 // There are two different cases that require reversing the input vectors.
13265 // For example, for vector <4 x i32> we have the following cases,
13266 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
13267 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
13268 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
13269 // to reverse two input vectors.
13270 if (Imm < NumElts)
13271 ReverseEXT = true;
13272 else
13273 Imm -= NumElts;
13274
13275 return true;
13276}
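// For example, for two v4i32 inputs the mask <2, 3, 4, 5> is an EXT with
// Imm = 2 and ReverseEXT = false, while <6, 7, 0, 1> selects the inputs in the
// opposite order and therefore sets ReverseEXT with Imm = 2.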
13277
13278/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
13279/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13280/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
13281static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13282 unsigned NumElts = VT.getVectorNumElements();
13283 if (NumElts % 2 != 0)
13284 return false;
13285 WhichResult = (M[0] == 0 ? 0 : 1);
13286 unsigned Idx = WhichResult * NumElts / 2;
13287 for (unsigned i = 0; i != NumElts; i += 2) {
13288 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
13289 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
13290 return false;
13291 Idx += 1;
13292 }
13293
13294 return true;
13295}
13296
13297/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
13298/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13299/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
13300static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13301 unsigned Half = VT.getVectorNumElements() / 2;
13302 WhichResult = (M[0] == 0 ? 0 : 1);
13303 for (unsigned j = 0; j != 2; ++j) {
13304 unsigned Idx = WhichResult;
13305 for (unsigned i = 0; i != Half; ++i) {
13306 int MIdx = M[i + j * Half];
13307 if (MIdx >= 0 && (unsigned)MIdx != Idx)
13308 return false;
13309 Idx += 2;
13310 }
13311 }
13312
13313 return true;
13314}
13315
13316/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
13317/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13318/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
13319static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13320 unsigned NumElts = VT.getVectorNumElements();
13321 if (NumElts % 2 != 0)
13322 return false;
13323 WhichResult = (M[0] == 0 ? 0 : 1);
13324 for (unsigned i = 0; i < NumElts; i += 2) {
13325 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
13326 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
13327 return false;
13328 }
13329 return true;
13330}
13331
13332static bool isINSMask(ArrayRef<int> M, int NumInputElements,
13333 bool &DstIsLeft, int &Anomaly) {
13334 if (M.size() != static_cast<size_t>(NumInputElements))
13335 return false;
13336
13337 int NumLHSMatch = 0, NumRHSMatch = 0;
13338 int LastLHSMismatch = -1, LastRHSMismatch = -1;
13339
13340 for (int i = 0; i < NumInputElements; ++i) {
13341 if (M[i] == -1) {
13342 ++NumLHSMatch;
13343 ++NumRHSMatch;
13344 continue;
13345 }
13346
13347 if (M[i] == i)
13348 ++NumLHSMatch;
13349 else
13350 LastLHSMismatch = i;
13351
13352 if (M[i] == i + NumInputElements)
13353 ++NumRHSMatch;
13354 else
13355 LastRHSMismatch = i;
13356 }
13357
13358 if (NumLHSMatch == NumInputElements - 1) {
13359 DstIsLeft = true;
13360 Anomaly = LastLHSMismatch;
13361 return true;
13362 } else if (NumRHSMatch == NumInputElements - 1) {
13363 DstIsLeft = false;
13364 Anomaly = LastRHSMismatch;
13365 return true;
13366 }
13367
13368 return false;
13369}
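// For example, with 4 input elements the mask <0, 1, 6, 3> matches the LHS in
// every lane except lane 2, so DstIsLeft is set and Anomaly = 2 (the lane that
// must be overwritten with an INS from the other vector).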
13370
13371static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
13372 if (VT.getSizeInBits() != 128)
13373 return false;
13374
13375 unsigned NumElts = VT.getVectorNumElements();
13376
13377 for (int I = 0, E = NumElts / 2; I != E; I++) {
13378 if (Mask[I] != I)
13379 return false;
13380 }
13381
13382 int Offset = NumElts / 2;
13383 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
13384 if (Mask[I] != I + SplitLHS * Offset)
13385 return false;
13386 }
13387
13388 return true;
13389}
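// For example, for a 128-bit v4i32 result the mask <0, 1, 4, 5> satisfies this
// check with SplitLHS = true: the low half of each (split) source is taken,
// which is exactly a CONCAT of the two extracted low halves.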
13390
13392 SDLoc DL(Op);
13393 EVT VT = Op.getValueType();
13394 SDValue V0 = Op.getOperand(0);
13395 SDValue V1 = Op.getOperand(1);
13396 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
13397
13400 return SDValue();
13401
13402 bool SplitV0 = V0.getValueSizeInBits() == 128;
13403
13404 if (!isConcatMask(Mask, VT, SplitV0))
13405 return SDValue();
13406
13407 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
13408 if (SplitV0) {
13409 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
13410 DAG.getConstant(0, DL, MVT::i64));
13411 }
13412 if (V1.getValueSizeInBits() == 128) {
13413 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
13414 DAG.getConstant(0, DL, MVT::i64));
13415 }
13416 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
13417}
13418
13419/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
13420/// the specified operations to build the shuffle. ID is the perfect-shuffle
13421/// ID, V1 and V2 are the original shuffle inputs. PFEntry is the perfect-shuffle
13422/// table entry and LHS/RHS are the immediate inputs for this stage of the
13423/// shuffle.
13425 SDValue V2, unsigned PFEntry, SDValue LHS,
13426 SDValue RHS, SelectionDAG &DAG,
13427 const SDLoc &dl) {
13428 unsigned OpNum = (PFEntry >> 26) & 0x0F;
13429 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
13430 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
13431
13432 enum {
13433 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
13434 OP_VREV,
13435 OP_VDUP0,
13436 OP_VDUP1,
13437 OP_VDUP2,
13438 OP_VDUP3,
13439 OP_VEXT1,
13440 OP_VEXT2,
13441 OP_VEXT3,
13442 OP_VUZPL, // VUZP, left result
13443 OP_VUZPR, // VUZP, right result
13444 OP_VZIPL, // VZIP, left result
13445 OP_VZIPR, // VZIP, right result
13446 OP_VTRNL, // VTRN, left result
13447 OP_VTRNR, // VTRN, right result
13448 OP_MOVLANE // Move lane. RHSID is the lane to move into
13449 };
13450
13451 if (OpNum == OP_COPY) {
13452 if (LHSID == (1 * 9 + 2) * 9 + 3)
13453 return LHS;
13454 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
13455 return RHS;
13456 }
13457
13458 if (OpNum == OP_MOVLANE) {
13459 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
13460 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
13461 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
13462 Elt = 3 - Elt;
13463 while (Elt > 0) {
13464 ID /= 9;
13465 Elt--;
13466 }
13467 return (ID % 9 == 8) ? -1 : ID % 9;
13468 };
13469
13470 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
13471 // get the lane to move from the PFID, which is always from the
13472 // original vectors (V1 or V2).
13474 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
13475 EVT VT = OpLHS.getValueType();
13476 assert(RHSID < 8 && "Expected a lane index for RHSID!");
13477 unsigned ExtLane = 0;
13478 SDValue Input;
13479
13480 // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
13481 // convert into a higher type.
13482 if (RHSID & 0x4) {
13483 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
13484 if (MaskElt == -1)
13485 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
13486 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
13487 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
13488 Input = MaskElt < 2 ? V1 : V2;
13489 if (VT.getScalarSizeInBits() == 16) {
13490 Input = DAG.getBitcast(MVT::v2f32, Input);
13491 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
13492 } else {
13493 assert(VT.getScalarSizeInBits() == 32 &&
13494 "Expected 16 or 32 bit shuffle elemements");
13495 Input = DAG.getBitcast(MVT::v2f64, Input);
13496 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
13497 }
13498 } else {
13499 int MaskElt = getPFIDLane(ID, RHSID);
13500 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
13501 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
13502 Input = MaskElt < 4 ? V1 : V2;
13503 // Be careful about creating illegal types. Use f16 instead of i16.
13504 if (VT == MVT::v4i16) {
13505 Input = DAG.getBitcast(MVT::v4f16, Input);
13506 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
13507 }
13508 }
13511 Input, DAG.getVectorIdxConstant(ExtLane, dl));
13512 SDValue Ins =
13513 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS,
13514 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl));
13515 return DAG.getBitcast(VT, Ins);
13516 }
13517
13518 SDValue OpLHS, OpRHS;
13519 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
13520 RHS, DAG, dl);
13521 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
13522 RHS, DAG, dl);
13523 EVT VT = OpLHS.getValueType();
13524
13525 switch (OpNum) {
13526 default:
13527 llvm_unreachable("Unknown shuffle opcode!");
13528 case OP_VREV:
13529 // VREV divides the vector in half and swaps within the half.
13530 if (VT.getVectorElementType() == MVT::i32 ||
13531 VT.getVectorElementType() == MVT::f32)
13532 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
13533 // vrev <4 x i16> -> REV32
13534 if (VT.getVectorElementType() == MVT::i16 ||
13535 VT.getVectorElementType() == MVT::f16 ||
13536 VT.getVectorElementType() == MVT::bf16)
13537 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
13538 // vrev <4 x i8> -> REV16
13539 assert(VT.getVectorElementType() == MVT::i8);
13540 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
13541 case OP_VDUP0:
13542 case OP_VDUP1:
13543 case OP_VDUP2:
13544 case OP_VDUP3: {
13545 EVT EltTy = VT.getVectorElementType();
13546 unsigned Opcode;
13547 if (EltTy == MVT::i8)
13548 Opcode = AArch64ISD::DUPLANE8;
13549 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
13550 Opcode = AArch64ISD::DUPLANE16;
13551 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
13552 Opcode = AArch64ISD::DUPLANE32;
13553 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
13554 Opcode = AArch64ISD::DUPLANE64;
13555 else
13556 llvm_unreachable("Invalid vector element type?");
13557
13558 if (VT.getSizeInBits() == 64)
13559 OpLHS = WidenVector(OpLHS, DAG);
13560 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
13561 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
13562 }
13563 case OP_VEXT1:
13564 case OP_VEXT2:
13565 case OP_VEXT3: {
13566 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
13567 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
13568 DAG.getConstant(Imm, dl, MVT::i32));
13569 }
13570 case OP_VUZPL:
13571 return DAG.getNode(AArch64ISD::UZP1, dl, VT, OpLHS, OpRHS);
13572 case OP_VUZPR:
13573 return DAG.getNode(AArch64ISD::UZP2, dl, VT, OpLHS, OpRHS);
13574 case OP_VZIPL:
13575 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, OpLHS, OpRHS);
13576 case OP_VZIPR:
13577 return DAG.getNode(AArch64ISD::ZIP2, dl, VT, OpLHS, OpRHS);
13578 case OP_VTRNL:
13579 return DAG.getNode(AArch64ISD::TRN1, dl, VT, OpLHS, OpRHS);
13580 case OP_VTRNR:
13581 return DAG.getNode(AArch64ISD::TRN2, dl, VT, OpLHS, OpRHS);
13582 }
13583}
13584
13586 SelectionDAG &DAG) {
13587 // Check to see if we can use the TBL instruction.
13588 SDValue V1 = Op.getOperand(0);
13589 SDValue V2 = Op.getOperand(1);
13590 SDLoc DL(Op);
13591
13592 EVT EltVT = Op.getValueType().getVectorElementType();
13593 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
13594
13595 bool Swap = false;
13596 if (V1.isUndef() || isZerosVector(V1.getNode())) {
13597 std::swap(V1, V2);
13598 Swap = true;
13599 }
13600
13601 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
13602 // out of range values with 0s. We do need to make sure that any out-of-range
13603 // values are really out-of-range for a v16i8 vector.
13604 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
13605 MVT IndexVT = MVT::v8i8;
13606 unsigned IndexLen = 8;
13607 if (Op.getValueSizeInBits() == 128) {
13608 IndexVT = MVT::v16i8;
13609 IndexLen = 16;
13610 }
13611
13613 for (int Val : ShuffleMask) {
13614 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
13615 unsigned Offset = Byte + Val * BytesPerElt;
13616 if (Swap)
13617 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
13618 if (IsUndefOrZero && Offset >= IndexLen)
13619 Offset = 255;
13620 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
13621 }
13622 }
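  // For example, shuffling two v4i16 vectors with mask <0, 5, 2, 7> produces
  // the byte indices <0,1, 10,11, 4,5, 14,15>; each 16-bit output lane expands
  // to two consecutive byte selectors into the (possibly concatenated) table.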
13623
13624 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
13625 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
13626
13627 SDValue Shuffle;
13628 if (IsUndefOrZero) {
13629 if (IndexLen == 8)
13630 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
13631 Shuffle = DAG.getNode(
13632 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
13633 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
13634 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
13635 } else {
13636 if (IndexLen == 8) {
13637 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
13638 Shuffle = DAG.getNode(
13639 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
13640 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
13641 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
13642 } else {
13643 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
13644 // cannot currently represent the register constraints on the input
13645 // table registers.
13646 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
13647 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
13648 // IndexLen));
13649 Shuffle = DAG.getNode(
13650 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
13651 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
13652 V2Cst,
13653 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
13654 }
13655 }
13656 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
13657}
13658
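// Map a vector element type to the corresponding DUPLANE node, e.g. i32 and
// f32 both select AArch64ISD::DUPLANE32.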
13659static unsigned getDUPLANEOp(EVT EltType) {
13660 if (EltType == MVT::i8)
13661 return AArch64ISD::DUPLANE8;
13662 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
13663 return AArch64ISD::DUPLANE16;
13664 if (EltType == MVT::i32 || EltType == MVT::f32)
13665 return AArch64ISD::DUPLANE32;
13666 if (EltType == MVT::i64 || EltType == MVT::f64)
13667 return AArch64ISD::DUPLANE64;
13668
13669 llvm_unreachable("Invalid vector element type?");
13670}
13671
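// Emit a DUPLANE-style splat (Opcode) of lane 'Lane' of V. The source is
// first canonicalised: bitcasts of extract_subvector, plain extract_subvector
// and concat_vectors are looked through so the DUPLANE can read directly from
// a 128-bit register, with the lane index rescaled or offset to match, and
// 64-bit sources are widened with undef.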
13672static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
13673 unsigned Opcode, SelectionDAG &DAG) {
13674 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
13675 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
13676 // Match: dup (bitcast (extract_subv X, C)), LaneC
13677 if (BitCast.getOpcode() != ISD::BITCAST ||
13678 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
13679 return false;
13680
13681 // The extract index must align in the destination type. That may not
13682 // happen if the bitcast is from narrow to wide type.
13683 SDValue Extract = BitCast.getOperand(0);
13684 unsigned ExtIdx = Extract.getConstantOperandVal(1);
13685 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
13686 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
13687 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
13688 if (ExtIdxInBits % CastedEltBitWidth != 0)
13689 return false;
13690
13691 // Can't handle cases where vector size is not 128-bit
13692 if (!Extract.getOperand(0).getValueType().is128BitVector())
13693 return false;
13694
13695 // Update the lane value by offsetting with the scaled extract index.
13696 LaneC += ExtIdxInBits / CastedEltBitWidth;
13697
13698 // Determine the casted vector type of the wide vector input.
13699 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
13700 // Examples:
13701 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
13702 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
13703 unsigned SrcVecNumElts =
13704 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
13705 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
13706 SrcVecNumElts);
13707 return true;
13708 };
13709 MVT CastVT;
13710 if (getScaledOffsetDup(V, Lane, CastVT)) {
13711 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
13712 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
13713 V.getOperand(0).getValueType().is128BitVector()) {
13714 // The lane is incremented by the index of the extract.
13715 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
13716 Lane += V.getConstantOperandVal(1);
13717 V = V.getOperand(0);
13718 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
13719 // The lane is decremented if we are splatting from the 2nd operand.
13720 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
13721 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
13722 Lane -= Idx * VT.getVectorNumElements() / 2;
13723 V = WidenVector(V.getOperand(Idx), DAG);
13724 } else if (VT.getSizeInBits() == 64) {
13725 // Widen the operand to 128-bit register with undef.
13726 V = WidenVector(V, DAG);
13727 }
13728 return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
13729}
13730
13731// Try to widen element type to get a new mask value for a better permutation
13732// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
13733// UZP1/2, TRN1/2, REV, INS, etc.
13734// For example:
13735// shufflevector <4 x i32> %a, <4 x i32> %b,
13736// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
13737// is equivalent to:
13738// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
13739// Finally, we can get:
13740// mov v0.d[0], v1.d[1]
13741 static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
13742 SDLoc DL(Op);
13743 EVT VT = Op.getValueType();
13744 EVT ScalarVT = VT.getVectorElementType();
13745 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
13746 SDValue V0 = Op.getOperand(0);
13747 SDValue V1 = Op.getOperand(1);
13748 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
13749
13750 // If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ...
13751 // We need to make sure the wider element type is legal. Thus, ElementSize
13752 // should not be larger than 32 bits, and the i1 type should also be excluded.
13753 if (ElementSize > 32 || ElementSize == 1)
13754 return SDValue();
13755
13756 SmallVector<int, 8> NewMask;
13757 if (widenShuffleMaskElts(Mask, NewMask)) {
13758 MVT NewEltVT = VT.isFloatingPoint()
13759 ? MVT::getFloatingPointVT(ElementSize * 2)
13760 : MVT::getIntegerVT(ElementSize * 2);
13761 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
13762 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
13763 V0 = DAG.getBitcast(NewVT, V0);
13764 V1 = DAG.getBitcast(NewVT, V1);
13765 return DAG.getBitcast(VT,
13766 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
13767 }
13768 }
13769
13770 return SDValue();
13771}
13772
13773// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
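// The two tbl2 calls each index a 32-byte table formed from their own two
// source registers. In the combined tbl4 the second pair of registers
// occupies bytes 32..63, so (as a rough illustration) an entry of 3 taken
// from the second tbl2's mask is rebased to 35 below, while entries taken
// from the first tbl2's mask are reused unchanged.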
13774 static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
13775 ArrayRef<int> ShuffleMask,
13776 SelectionDAG &DAG) {
13777 SDValue Tbl1 = Op->getOperand(0);
13778 SDValue Tbl2 = Op->getOperand(1);
13779 SDLoc dl(Op);
13780 SDValue Tbl2ID =
13781 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64);
13782
13783 EVT VT = Op.getValueType();
13784 if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
13785 Tbl1->getOperand(0) != Tbl2ID ||
13786 Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
13787 Tbl2->getOperand(0) != Tbl2ID)
13788 return SDValue();
13789
13790 if (Tbl1->getValueType(0) != MVT::v16i8 ||
13791 Tbl2->getValueType(0) != MVT::v16i8)
13792 return SDValue();
13793
13794 SDValue Mask1 = Tbl1->getOperand(3);
13795 SDValue Mask2 = Tbl2->getOperand(3);
13796 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
13797 for (unsigned I = 0; I < 16; I++) {
13798 if (ShuffleMask[I] < 16)
13799 TBLMaskParts[I] = Mask1->getOperand(ShuffleMask[I]);
13800 else {
13801 auto *C =
13802 dyn_cast<ConstantSDNode>(Mask2->getOperand(ShuffleMask[I] - 16));
13803 if (!C)
13804 return SDValue();
13805 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32);
13806 }
13807 }
13808
13809 SDValue TBLMask = DAG.getBuildVector(VT, dl, TBLMaskParts);
13810 SDValue ID =
13811 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64);
13812
13813 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
13814 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
13815 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
13816}
13817
13818// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
13819// but we don't have an appropriate instruction,
13820// so custom-lower it as ZIP1-with-zeros.
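// For example (illustrative, little-endian): (v8i16 (zero_extend_vector_inreg
// (v16i8 X))) becomes a v16i8 ZIP1 of X with zeroes, bitcast to v8i16, since
// interleaving each byte with a zero byte yields the zero-extended halfwords.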
13821SDValue
13822AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
13823 SelectionDAG &DAG) const {
13824 SDLoc dl(Op);
13825 EVT VT = Op.getValueType();
13826 SDValue SrcOp = Op.getOperand(0);
13827 EVT SrcVT = SrcOp.getValueType();
13828 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
13829 "Unexpected extension factor.");
13830 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
13831 // FIXME: support multi-step zipping?
13832 if (Scale != 2)
13833 return SDValue();
13834 SDValue Zeros = DAG.getConstant(0, dl, SrcVT);
13835 return DAG.getBitcast(VT,
13836 DAG.getNode(AArch64ISD::ZIP1, dl, SrcVT, SrcOp, Zeros));
13837}
13838
13839SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
13840 SelectionDAG &DAG) const {
13841 SDLoc dl(Op);
13842 EVT VT = Op.getValueType();
13843
13844 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
13845
13846 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
13847 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
13848
13849 // Convert shuffles that are directly supported on NEON to target-specific
13850 // DAG nodes, instead of keeping them as shuffles and matching them again
13851 // during code selection. This is more efficient and avoids the possibility
13852 // of inconsistencies between legalization and selection.
13853 ArrayRef<int> ShuffleMask = SVN->getMask();
13854
13855 SDValue V1 = Op.getOperand(0);
13856 SDValue V2 = Op.getOperand(1);
13857
13858 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
13859 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
13860 "Unexpected VECTOR_SHUFFLE mask size!");
13861
13862 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
13863 return Res;
13864
13865 if (SVN->isSplat()) {
13866 int Lane = SVN->getSplatIndex();
13867 // If this is undef splat, generate it via "just" vdup, if possible.
13868 if (Lane == -1)
13869 Lane = 0;
13870
13871 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
13872 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
13873 V1.getOperand(0));
13874 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
13875 // constant. If so, we can just reference the lane's definition directly.
13876 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
13877 !isa<ConstantSDNode>(V1.getOperand(Lane)))
13878 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
13879
13880 // Otherwise, duplicate from the lane of the input vector.
13881 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
13882 return constructDup(V1, Lane, dl, VT, Opcode, DAG);
13883 }
13884
13885 // Check if the mask matches a DUP for a wider element
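// For instance (illustrative), a v8i16 shuffle with mask <0,1,0,1,0,1,0,1>
// is a splat of the first 32-bit lane, so it is lowered as a DUPLANE32 on a
// v4i32 bitcast of the input.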
13886 for (unsigned LaneSize : {64U, 32U, 16U}) {
13887 unsigned Lane = 0;
13888 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
13889 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
13890 : LaneSize == 32 ? AArch64ISD::DUPLANE32
13891 : AArch64ISD::DUPLANE16;
13892 // Cast V1 to an integer vector with required lane size
13893 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
13894 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
13895 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
13896 V1 = DAG.getBitcast(NewVecTy, V1);
13897 // Construct the DUP instruction
13898 V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
13899 // Cast back to the original type
13900 return DAG.getBitcast(VT, V1);
13901 }
13902 }
13903
13904 unsigned NumElts = VT.getVectorNumElements();
13905 unsigned EltSize = VT.getScalarSizeInBits();
13906 if (isREVMask(ShuffleMask, EltSize, NumElts, 64))
13907 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1);
13908 if (isREVMask(ShuffleMask, EltSize, NumElts, 32))
13909 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1);
13910 if (isREVMask(ShuffleMask, EltSize, NumElts, 16))
13911 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1);
13912
13913 if (((NumElts == 8 && EltSize == 16) || (NumElts == 16 && EltSize == 8)) &&
13914 ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
13915 SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
13916 return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
13917 DAG.getConstant(8, dl, MVT::i32));
13918 }
13919
13920 bool ReverseEXT = false;
13921 unsigned Imm;
13922 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
13923 if (ReverseEXT)
13924 std::swap(V1, V2);
13925 Imm *= getExtFactor(V1);
13926 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
13927 DAG.getConstant(Imm, dl, MVT::i32));
13928 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
13929 Imm *= getExtFactor(V1);
13930 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
13931 DAG.getConstant(Imm, dl, MVT::i32));
13932 }
13933
13934 unsigned WhichResult;
13935 if (isZIPMask(ShuffleMask, NumElts, WhichResult)) {
13936 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
13937 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
13938 }
13939 if (isUZPMask(ShuffleMask, NumElts, WhichResult)) {
13940 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
13941 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
13942 }
13943 if (isTRNMask(ShuffleMask, NumElts, WhichResult)) {
13944 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
13945 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
13946 }
13947
13948 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
13949 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
13950 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
13951 }
13952 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
13953 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
13954 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
13955 }
13956 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
13957 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
13958 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
13959 }
13960
13961 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
13962 return Concat;
13963
13964 bool DstIsLeft;
13965 int Anomaly;
13966 int NumInputElements = V1.getValueType().getVectorNumElements();
13967 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
13968 SDValue DstVec = DstIsLeft ? V1 : V2;
13969 SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
13970
13971 SDValue SrcVec = V1;
13972 int SrcLane = ShuffleMask[Anomaly];
13973 if (SrcLane >= NumInputElements) {
13974 SrcVec = V2;
13975 SrcLane -= NumElts;
13976 }
13977 SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
13978
13979 EVT ScalarVT = VT.getVectorElementType();
13980
13981 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
13982 ScalarVT = MVT::i32;
13983
13984 return DAG.getNode(
13985 ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
13986 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
13987 DstLaneV);
13988 }
13989
13990 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
13991 return NewSD;
13992
13993 // If the shuffle is not directly supported and it has 4 elements, use
13994 // the PerfectShuffle-generated table to synthesize it from other shuffles.
13995 if (NumElts == 4) {
13996 unsigned PFIndexes[4];
13997 for (unsigned i = 0; i != 4; ++i) {
13998 if (ShuffleMask[i] < 0)
13999 PFIndexes[i] = 8;
14000 else
14001 PFIndexes[i] = ShuffleMask[i];
14002 }
14003
14004 // Compute the index in the perfect shuffle table.
14005 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
14006 PFIndexes[2] * 9 + PFIndexes[3];
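// (Illustrative arithmetic: the indices are base-9 digits, with 8 standing
// for undef, so mask <1,1,3,3> yields 1*729 + 1*81 + 3*9 + 3 = 840.)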
14007 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
14008 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
14009 dl);
14010 }
14011
14012 // Check for a "select shuffle", generating a BSL to pick between lanes in
14013 // V1/V2.
14014 if (ShuffleVectorInst::isSelectMask(ShuffleMask, NumElts)) {
14015 assert(VT.getScalarSizeInBits() <= 32 &&
14016 "Expected larger vector element sizes to be handled already");
14017 SmallVector<SDValue> MaskElts;
14018 for (int M : ShuffleMask)
14019 MaskElts.push_back(DAG.getConstant(
14020 M >= static_cast<int>(NumElts) ? 0 : 0xffffffff, dl, MVT::i32));
14021 EVT IVT = VT.changeVectorElementTypeToInteger();
14022 SDValue MaskConst = DAG.getBuildVector(IVT, dl, MaskElts);
14023 return DAG.getBitcast(VT, DAG.getNode(AArch64ISD::BSP, dl, IVT, MaskConst,
14024 DAG.getBitcast(IVT, V1),
14025 DAG.getBitcast(IVT, V2)));
14026 }
14027
14028 // Fall back to generating a TBL
14029 return GenerateTBL(Op, ShuffleMask, DAG);
14030}
14031
14032SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
14033 SelectionDAG &DAG) const {
14034 EVT VT = Op.getValueType();
14035
14036 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14037 return LowerToScalableOp(Op, DAG);
14038
14039 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
14040 "Unexpected vector type!");
14041
14042 // We can handle the constant cases during isel.
14043 if (isa<ConstantSDNode>(Op.getOperand(0)))
14044 return Op;
14045
14046 // There isn't a natural way to handle the general i1 case, so we use some
14047 // trickery with whilelo.
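// The splat value is sign-extended from i1 to i64, giving 0 or -1. A
// whilelo(0, 0) then produces an all-false predicate, while whilelo(0, -1)
// (i.e. comparing against UINT64_MAX) produces an all-true predicate.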
14048 SDLoc DL(Op);
14049 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
14050 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
14051 DAG.getValueType(MVT::i1));
14052 SDValue ID =
14053 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
14054 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
14055 if (VT == MVT::nxv1i1)
14056 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
14057 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
14058 Zero, SplatVal),
14059 Zero);
14060 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
14061}
14062
14063SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
14064 SelectionDAG &DAG) const {
14065 SDLoc DL(Op);
14066
14067 EVT VT = Op.getValueType();
14068 if (!isTypeLegal(VT) || !VT.isScalableVector())
14069 return SDValue();
14070
14071 // Current lowering only supports the SVE-ACLE types.
14072 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
14073 return SDValue();
14074
14075 // The DUPQ operation is independent of element type so normalise to i64s.
14076 SDValue Idx128 = Op.getOperand(2);
14077
14078 // DUPQ can be used when idx is in range.
14079 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
14080 if (CIdx && (CIdx->getZExtValue() <= 3)) {
14081 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
14082 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
14083 }
14084
14085 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
14086
14087 // The ACLE says this must produce the same result as:
14088 // svtbl(data, svadd_x(svptrue_b64(),
14089 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
14090 // index * 2))
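// Roughly, for a 128-bit quadword index k this builds the 64-bit element
// index vector 2k, 2k+1, 2k, 2k+1, ... so the TBL below replicates quadword
// k across the vector; e.g. (illustratively) k = 5 gives 10,11,10,11,...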
14091 SDValue One = DAG.getConstant(1, DL, MVT::i64);
14092 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
14093
14094 // create the vector 0,1,0,1,...
14095 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
14096 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
14097
14098 // create the vector idx64,idx64+1,idx64,idx64+1,...
14099 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
14100 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
14101 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
14102
14103 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
14104 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
14105 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
14106}
14107
14108
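// Flatten a constant-splat BUILD_VECTOR into a bit pattern covering the full
// vector width (CnstBits), alongside a second pattern derived from the
// splat's undef bits (UndefBits). Returns false if the node is not a
// constant splat.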
14109static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
14110 APInt &UndefBits) {
14111 EVT VT = BVN->getValueType(0);
14112 APInt SplatBits, SplatUndef;
14113 unsigned SplatBitSize;
14114 bool HasAnyUndefs;
14115 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14116 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
14117
14118 for (unsigned i = 0; i < NumSplats; ++i) {
14119 CnstBits <<= SplatBitSize;
14120 UndefBits <<= SplatBitSize;
14121 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
14122 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
14123 }
14124
14125 return true;
14126 }
14127
14128 return false;
14129}
14130
14131// Try 64-bit splatted SIMD immediate.
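// This and the helpers below check whether the replicated 64-bit pattern in
// Bits matches one of the AdvSIMD modified-immediate encodings; if so they
// emit NewOp (a MOVI/MVNI/FMOV style node) on a canonical vector type with
// the encoded immediate (and, where applicable, a shift), then NVCAST back
// to the original type.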
14132static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14133 const APInt &Bits) {
14134 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14135 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14136 EVT VT = Op.getValueType();
14137 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
14138
14139 if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
14140 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
14141
14142 SDLoc dl(Op);
14143 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
14144 DAG.getConstant(Value, dl, MVT::i32));
14145 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
14146 }
14147 }
14148
14149 return SDValue();
14150}
14151
14152// Try 32-bit splatted SIMD immediate.
14153static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14154 const APInt &Bits,
14155 const SDValue *LHS = nullptr) {
14156 EVT VT = Op.getValueType();
14157 if (VT.isFixedLengthVector() &&
14158 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
14159 return SDValue();
14160
14161 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14162 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14163 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
14164 bool isAdvSIMDModImm = false;
14165 uint64_t Shift;
14166
14167 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
14168 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
14169 Shift = 0;
14170 }
14171 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
14172 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
14173 Shift = 8;
14174 }
14175 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
14176 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
14177 Shift = 16;
14178 }
14179 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
14180 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
14181 Shift = 24;
14182 }
14183
14184 if (isAdvSIMDModImm) {
14185 SDLoc dl(Op);
14186 SDValue Mov;
14187
14188 if (LHS)
14189 Mov = DAG.getNode(NewOp, dl, MovTy,
14190 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
14191 DAG.getConstant(Value, dl, MVT::i32),
14192 DAG.getConstant(Shift, dl, MVT::i32));
14193 else
14194 Mov = DAG.getNode(NewOp, dl, MovTy,
14195 DAG.getConstant(Value, dl, MVT::i32),
14196 DAG.getConstant(Shift, dl, MVT::i32));
14197
14198 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
14199 }
14200 }
14201
14202 return SDValue();
14203}
14204
14205// Try 16-bit splatted SIMD immediate.
14206static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14207 const APInt &Bits,
14208 const SDValue *LHS = nullptr) {
14209 EVT VT = Op.getValueType();
14210 if (VT.isFixedLengthVector() &&
14211 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
14212 return SDValue();
14213
14214 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14215 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14216 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
14217 bool isAdvSIMDModImm = false;
14218 uint64_t Shift;
14219
14220 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
14221 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
14222 Shift = 0;
14223 }
14224 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
14225 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
14226 Shift = 8;
14227 }
14228
14229 if (isAdvSIMDModImm) {
14230 SDLoc dl(Op);
14231 SDValue Mov;
14232
14233 if (LHS)
14234 Mov = DAG.getNode(NewOp, dl, MovTy,
14235 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
14236 DAG.getConstant(Value, dl, MVT::i32),
14237 DAG.getConstant(Shift, dl, MVT::i32));
14238 else
14239 Mov = DAG.getNode(NewOp, dl, MovTy,
14240 DAG.getConstant(Value, dl, MVT::i32),
14241 DAG.getConstant(Shift, dl, MVT::i32));
14242
14243 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
14244 }
14245 }
14246
14247 return SDValue();
14248}
14249
14250// Try 32-bit splatted SIMD immediate with shifted ones.
14251 static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
14252 SelectionDAG &DAG, const APInt &Bits) {
14253 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14254 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14255 EVT VT = Op.getValueType();
14256 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
14257 bool isAdvSIMDModImm = false;
14258 uint64_t Shift;
14259
14260 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
14261 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
14262 Shift = 264;
14263 }
14264 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
14265 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
14266 Shift = 272;
14267 }
14268
14269 if (isAdvSIMDModImm) {
14270 SDLoc dl(Op);
14271 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
14272 DAG.getConstant(Value, dl, MVT::i32),
14273 DAG.getConstant(Shift, dl, MVT::i32));
14274 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
14275 }
14276 }
14277
14278 return SDValue();
14279}
14280
14281// Try 8-bit splatted SIMD immediate.
14282static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14283 const APInt &Bits) {
14284 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14285 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14286 EVT VT = Op.getValueType();
14287 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
14288
14289 if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
14290 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
14291
14292 SDLoc dl(Op);
14293 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
14294 DAG.getConstant(Value, dl, MVT::i32));
14295 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
14296 }
14297 }
14298
14299 return SDValue();
14300}
14301
14302// Try FP splatted SIMD immediate.
14303static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14304 const APInt &Bits) {
14305 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14306 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14307 EVT VT = Op.getValueType();
14308 bool isWide = (VT.getSizeInBits() == 128);
14309 MVT MovTy;
14310 bool isAdvSIMDModImm = false;
14311
14312 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
14313 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
14314 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
14315 }
14316 else if (isWide &&
14317 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
14318 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
14319 MovTy = MVT::v2f64;
14320 }
14321
14322 if (isAdvSIMDModImm) {
14323 SDLoc dl(Op);
14324 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
14325 DAG.getConstant(Value, dl, MVT::i32));
14326 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
14327 }
14328 }
14329
14330 return SDValue();
14331}
14332
14333// Specialized code to quickly find if PotentialBVec is a BuildVector that
14334// consists of only the same constant int value, returned in reference arg
14335// ConstVal
14336static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
14337 uint64_t &ConstVal) {
14338 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
14339 if (!Bvec)
14340 return false;
14341 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
14342 if (!FirstElt)
14343 return false;
14344 EVT VT = Bvec->getValueType(0);
14345 unsigned NumElts = VT.getVectorNumElements();
14346 for (unsigned i = 1; i < NumElts; ++i)
14347 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
14348 return false;
14349 ConstVal = FirstElt->getZExtValue();
14350 return true;
14351}
14352
14353 static bool isAllInactivePredicate(SDValue N) {
14354 // Look through cast.
14355 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
14356 N = N.getOperand(0);
14357
14358 return ISD::isConstantSplatVectorAllZeros(N.getNode());
14359}
14360
14361 static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
14362 unsigned NumElts = N.getValueType().getVectorMinNumElements();
14363
14364 // Look through cast.
14365 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
14366 N = N.getOperand(0);
14367 // When reinterpreting from a type with fewer elements the "new" elements
14368 // are not active, so bail if they're likely to be used.
14369 if (N.getValueType().getVectorMinNumElements() < NumElts)
14370 return false;
14371 }
14372
14373 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
14374 return true;
14375
14376 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
14377 // or smaller than the implicit element type represented by N.
14378 // NOTE: A larger element count implies a smaller element type.
14379 if (N.getOpcode() == AArch64ISD::PTRUE &&
14380 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
14381 return N.getValueType().getVectorMinNumElements() >= NumElts;
14382
14383 // If we're compiling for a specific vector-length, we can check if the
14384 // pattern's VL equals that of the scalable vector at runtime.
14385 if (N.getOpcode() == AArch64ISD::PTRUE) {
14386 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
14387 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
14388 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
14389 if (MaxSVESize && MinSVESize == MaxSVESize) {
14390 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
14391 unsigned PatNumElts =
14392 getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
14393 return PatNumElts == (NumElts * VScale);
14394 }
14395 }
14396
14397 return false;
14398}
14399
14400// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
14401// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
14402// BUILD_VECTORs with constant element C1, C2 is a constant, and:
14403// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
14404// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
14405// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
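// For example (illustrative): on v4i32, (or (and X, splat(0x00ffffff)),
// (VSHL Y, #24)) satisfies C1 == ~(Ones(32) << 24) and so becomes
// (VSLI X, Y, #24), i.e. a single SLI instruction.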
14406 static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
14407 EVT VT = N->getValueType(0);
14408
14409 if (!VT.isVector())
14410 return SDValue();
14411
14412 SDLoc DL(N);
14413
14414 SDValue And;
14415 SDValue Shift;
14416
14417 SDValue FirstOp = N->getOperand(0);
14418 unsigned FirstOpc = FirstOp.getOpcode();
14419 SDValue SecondOp = N->getOperand(1);
14420 unsigned SecondOpc = SecondOp.getOpcode();
14421
14422 // Is one of the operands an AND or a BICi? The AND may have been optimised to
14423 // a BICi in order to use an immediate instead of a register.
14424 // Is the other operand an shl or lshr? This will have been turned into:
14425 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
14426 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
14427 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
14428 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
14429 SecondOpc == AArch64ISD::SHL_PRED ||
14430 SecondOpc == AArch64ISD::SRL_PRED)) {
14431 And = FirstOp;
14432 Shift = SecondOp;
14433
14434 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
14435 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
14436 FirstOpc == AArch64ISD::SHL_PRED ||
14437 FirstOpc == AArch64ISD::SRL_PRED)) {
14438 And = SecondOp;
14439 Shift = FirstOp;
14440 } else
14441 return SDValue();
14442
14443 bool IsAnd = And.getOpcode() == ISD::AND;
14444 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
14445 Shift.getOpcode() == AArch64ISD::SRL_PRED;
14446 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
14447 Shift.getOpcode() == AArch64ISD::SRL_PRED;
14448
14449 // Is the shift amount constant and are all lanes active?
14450 uint64_t C2;
14451 if (ShiftHasPredOp) {
14452 if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
14453 return SDValue();
14454 APInt C;
14455 if (!ISD::isConstantSplatVector(Shift.getOperand(2).getNode(), C))
14456 return SDValue();
14457 C2 = C.getZExtValue();
14458 } else if (ConstantSDNode *C2node =
14459 dyn_cast<ConstantSDNode>(Shift.getOperand(1)))
14460 C2 = C2node->getZExtValue();
14461 else
14462 return SDValue();
14463
14464 APInt C1AsAPInt;
14465 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
14466 if (IsAnd) {
14467 // Is the and mask vector all constant?
14468 if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
14469 return SDValue();
14470 } else {
14471 // Reconstruct the corresponding AND immediate from the two BICi immediates.
14472 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
14473 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
14474 assert(C1nodeImm && C1nodeShift);
14475 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
14476 C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
14477 }
14478
14479 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
14480 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
14481 // how much one can shift elements of a particular size?
14482 if (C2 > ElemSizeInBits)
14483 return SDValue();
14484
14485 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
14486 : APInt::getLowBitsSet(ElemSizeInBits, C2);
14487 if (C1AsAPInt != RequiredC1)
14488 return SDValue();
14489
14490 SDValue X = And.getOperand(0);
14491 SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
14492 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
14493 : Shift.getOperand(1);
14494
14495 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
14496 SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Imm);
14497
14498 LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
14499 LLVM_DEBUG(N->dump(&DAG));
14500 LLVM_DEBUG(dbgs() << "into: \n");
14501 LLVM_DEBUG(ResultSLI->dump(&DAG));
14502
14503 ++NumShiftInserts;
14504 return ResultSLI;
14505}
14506
14507SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
14508 SelectionDAG &DAG) const {
14509 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
14510 !Subtarget->isNeonAvailable()))
14511 return LowerToScalableOp(Op, DAG);
14512
14513 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
14514 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
14515 return Res;
14516
14517 EVT VT = Op.getValueType();
14518 if (VT.isScalableVector())
14519 return Op;
14520
14521 SDValue LHS = Op.getOperand(0);
14522 BuildVectorSDNode *BVN =
14523 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
14524 if (!BVN) {
14525 // OR commutes, so try swapping the operands.
14526 LHS = Op.getOperand(1);
14527 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
14528 }
14529 if (!BVN)
14530 return Op;
14531
14532 APInt DefBits(VT.getSizeInBits(), 0);
14533 APInt UndefBits(VT.getSizeInBits(), 0);
14534 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
14535 SDValue NewOp;
14536
14537 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
14538 DefBits, &LHS)) ||
14539 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
14540 DefBits, &LHS)))
14541 return NewOp;
14542
14543 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
14544 UndefBits, &LHS)) ||
14545 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
14546 UndefBits, &LHS)))
14547 return NewOp;
14548 }
14549
14550 // We can always fall back to a non-immediate OR.
14551 return Op;
14552}
14553
14554// Normalize the operands of BUILD_VECTOR. The value of constant operands will
14555// be truncated to fit element width.
14556 static SDValue NormalizeBuildVector(SDValue Op,
14557 SelectionDAG &DAG) {
14558 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
14559 SDLoc dl(Op);
14560 EVT VT = Op.getValueType();
14561 EVT EltTy= VT.getVectorElementType();
14562
14563 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
14564 return Op;
14565
14566 SmallVector<SDValue, 16> Ops;
14567 for (SDValue Lane : Op->ops()) {
14568 // For integer vectors, type legalization would have promoted the
14569 // operands already. Otherwise, if Op is a floating-point splat
14570 // (with operands cast to integers), then the only possibilities
14571 // are constants and UNDEFs.
14572 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
14573 Lane = DAG.getConstant(
14574 CstLane->getAPIntValue().trunc(EltTy.getSizeInBits()).getZExtValue(),
14575 dl, MVT::i32);
14576 } else if (Lane.getNode()->isUndef()) {
14577 Lane = DAG.getUNDEF(MVT::i32);
14578 } else {
14579 assert(Lane.getValueType() == MVT::i32 &&
14580 "Unexpected BUILD_VECTOR operand type");
14581 }
14582 Ops.push_back(Lane);
14583 }
14584 return DAG.getBuildVector(VT, dl, Ops);
14585}
14586
14587 static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
14588 const AArch64Subtarget *ST) {
14589 EVT VT = Op.getValueType();
14590 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
14591 "Expected a legal NEON vector");
14592
14593 APInt DefBits(VT.getSizeInBits(), 0);
14594 APInt UndefBits(VT.getSizeInBits(), 0);
14595 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
14596 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
14597 auto TryMOVIWithBits = [&](APInt DefBits) {
14598 SDValue NewOp;
14599 if ((NewOp =
14600 tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
14601 (NewOp =
14602 tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
14603 (NewOp =
14604 tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
14605 (NewOp =
14606 tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
14607 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
14608 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
14609 return NewOp;
14610
14611 APInt NotDefBits = ~DefBits;
14612 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
14613 NotDefBits)) ||
14614 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
14615 NotDefBits)) ||
14616 (NewOp =
14617 tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
14618 return NewOp;
14619 return SDValue();
14620 };
14621 if (SDValue R = TryMOVIWithBits(DefBits))
14622 return R;
14623 if (SDValue R = TryMOVIWithBits(UndefBits))
14624 return R;
14625
14626 // See if a fneg of the constant can be materialized with a MOVI, etc
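// Flipping the sign bit of every FP element may turn a constant with no
// direct MOVI/MVNI/FMOV encoding into one that has such an encoding; the
// original value is then recovered with an FNEG of the materialized constant,
// avoiding a constant-pool load at the cost of one extra instruction.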
14627 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
14628 // FNegate each sub-element of the constant
14629 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
14630 APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
14631 .zext(VT.getSizeInBits());
14632 APInt NegBits(VT.getSizeInBits(), 0);
14633 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
14634 for (unsigned i = 0; i < NumElts; i++)
14635 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
14636 NegBits = DefBits ^ NegBits;
14637
14638 // Try to create the new constants with MOVI, and if so generate a fneg
14639 // for it.
14640 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
14641 SDLoc DL(Op);
14642 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
14643 return DAG.getNode(
14644 AArch64ISD::NVCAST, DL, VT,
14645 DAG.getNode(ISD::FNEG, DL, VFVT,
14646 DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
14647 }
14648 return SDValue();
14649 };
14650 SDValue R;
14651 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
14652 (R = TryWithFNeg(DefBits, MVT::f64)) ||
14653 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
14654 return R;
14655 }
14656
14657 return SDValue();
14658}
14659
14660SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
14661 SDValue Op, SelectionDAG &DAG) const {
14662 EVT VT = Op.getValueType();
14663 SDLoc DL(Op);
14664 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
14665 auto *BVN = cast<BuildVectorSDNode>(Op);
14666
14667 if (auto SeqInfo = BVN->isConstantSequence()) {
14668 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
14669 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
14670 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
14671 return convertFromScalableVector(DAG, VT, Seq);
14672 }
14673
14674 unsigned NumElems = VT.getVectorNumElements();
14675 if (!VT.isPow2VectorType() || VT.getFixedSizeInBits() > 128 ||
14676 NumElems <= 1 || BVN->isConstant())
14677 return SDValue();
14678
14679 auto IsExtractElt = [](SDValue Op) {
14680 return Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
14681 };
14682
14683 // For integer types that are not already in vectors, limit to at most four
14684 // elements. This is an arbitrary restriction to avoid many fmovs from GPRs.
14685 if (VT.getScalarType().isInteger() &&
14686 NumElems - count_if(Op->op_values(), IsExtractElt) > 4)
14687 return SDValue();
14688
14689 // Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s.
14690 SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64);
14691 SmallVector<SDValue, 16> Intermediates = map_to_vector<16>(
14692 Op->op_values(), [&, Undef = DAG.getUNDEF(ContainerVT)](SDValue Op) {
14693 return Op.isUndef() ? Undef
14694 : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
14695 ContainerVT, Undef, Op, ZeroI64);
14696 });
14697
14698 ElementCount ZipEC = ContainerVT.getVectorElementCount();
14699 while (Intermediates.size() > 1) {
14700 EVT ZipVT = getPackedSVEVectorVT(ZipEC);
14701
14702 for (unsigned I = 0; I < Intermediates.size(); I += 2) {
14703 SDValue Op0 = DAG.getBitcast(ZipVT, Intermediates[I + 0]);
14704 SDValue Op1 = DAG.getBitcast(ZipVT, Intermediates[I + 1]);
14705 Intermediates[I / 2] =
14706 Op1.isUndef() ? Op0
14707 : DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT, Op0, Op1);
14708 }
14709
14710 Intermediates.resize(Intermediates.size() / 2);
14711 ZipEC = ZipEC.divideCoefficientBy(2);
14712 }
14713
14714 assert(Intermediates.size() == 1);
14715 SDValue Vec = DAG.getBitcast(ContainerVT, Intermediates[0]);
14716 return convertFromScalableVector(DAG, VT, Vec);
14717}
14718
14719SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
14720 SelectionDAG &DAG) const {
14721 EVT VT = Op.getValueType();
14722
14723 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
14724 cast<BuildVectorSDNode>(Op)->isConstantSequence();
14725 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON))
14726 return LowerFixedLengthBuildVectorToSVE(Op, DAG);
14727
14728 // Try to build a simple constant vector.
14729 Op = NormalizeBuildVector(Op, DAG);
14730 // Though this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so,
14731 // abort.
14732 if (Op.getOpcode() != ISD::BUILD_VECTOR)
14733 return SDValue();
14734
14735 // Certain vector constants, used to express things like logical NOT and
14736 // arithmetic NEG, are passed through unmodified. This allows special
14737 // patterns for these operations to match, which will lower these constants
14738 // to whatever is proven necessary.
14739 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
14740 if (BVN->isConstant()) {
14741 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
14742 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
14743 APInt Val(BitSize,
14744 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
14745 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
14746 return Op;
14747 }
14748 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
14749 if (Const->isZero() && !Const->isNegative())
14750 return Op;
14751 }
14752
14753 if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
14754 return V;
14755
14756 // Scan through the operands to find some interesting properties we can
14757 // exploit:
14758 // 1) If only one value is used, we can use a DUP, or
14759 // 2) if only the low element is not undef, we can just insert that, or
14760 // 3) if only one constant value is used (w/ some non-constant lanes),
14761 // we can splat the constant value into the whole vector then fill
14762 // in the non-constant lanes.
14763 // 4) FIXME: If different constant values are used, but we can intelligently
14764 // select the values we'll be overwriting for the non-constant
14765 // lanes such that we can directly materialize the vector
14766 // some other way (MOVI, e.g.), we can be sneaky.
14767 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
14768 SDLoc dl(Op);
14769 unsigned NumElts = VT.getVectorNumElements();
14770 bool isOnlyLowElement = true;
14771 bool usesOnlyOneValue = true;
14772 bool usesOnlyOneConstantValue = true;
14773 bool isConstant = true;
14774 bool AllLanesExtractElt = true;
14775 unsigned NumConstantLanes = 0;
14776 unsigned NumDifferentLanes = 0;
14777 unsigned NumUndefLanes = 0;
14778 SDValue Value;
14779 SDValue ConstantValue;
14780 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
14781 unsigned ConsecutiveValCount = 0;
14782 SDValue PrevVal;
14783 for (unsigned i = 0; i < NumElts; ++i) {
14784 SDValue V = Op.getOperand(i);
14785 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14786 AllLanesExtractElt = false;
14787 if (V.isUndef()) {
14788 ++NumUndefLanes;
14789 continue;
14790 }
14791 if (i > 0)
14792 isOnlyLowElement = false;
14793 if (!isIntOrFPConstant(V))
14794 isConstant = false;
14795
14796 if (isIntOrFPConstant(V)) {
14797 ++NumConstantLanes;
14798 if (!ConstantValue.getNode())
14799 ConstantValue = V;
14800 else if (ConstantValue != V)
14801 usesOnlyOneConstantValue = false;
14802 }
14803
14804 if (!Value.getNode())
14805 Value = V;
14806 else if (V != Value) {
14807 usesOnlyOneValue = false;
14808 ++NumDifferentLanes;
14809 }
14810
14811 if (PrevVal != V) {
14812 ConsecutiveValCount = 0;
14813 PrevVal = V;
14814 }
14815
14816 // Keep the different values and their last consecutive counts. For example,
14817 //
14818 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
14819 // t24, t24, t24, t24, t24, t24, t24, t24
14820 // t23 = consecutive count 8
14821 // t24 = consecutive count 8
14822 // ------------------------------------------------------------------
14823 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
14824 // t24, t24, t24, t24, t24, t24, t24, t24
14825 // t23 = consecutive count 5
14826 // t24 = consecutive count 9
14827 DifferentValueMap[V] = ++ConsecutiveValCount;
14828 }
14829
14830 if (!Value.getNode()) {
14831 LLVM_DEBUG(
14832 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
14833 return DAG.getUNDEF(VT);
14834 }
14835
14836 // Convert BUILD_VECTOR where all elements but the lowest are undef into
14837 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
14838 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
14839 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
14840 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
14841 "SCALAR_TO_VECTOR node\n");
14842 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
14843 }
14844
14845 if (AllLanesExtractElt) {
14846 SDNode *Vector = nullptr;
14847 bool Even = false;
14848 bool Odd = false;
14849 // Check whether the extract elements match the Even pattern <0,2,4,...> or
14850 // the Odd pattern <1,3,5,...>.
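// For example (illustrative): a v4i16 build_vector of lanes 0,2,4,6 extracted
// from one v8i16 source becomes UZP1 of the source's two halves, and lanes
// 1,3,5,7 become UZP2.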
14851 for (unsigned i = 0; i < NumElts; ++i) {
14852 SDValue V = Op.getOperand(i);
14853 const SDNode *N = V.getNode();
14854 if (!isa<ConstantSDNode>(N->getOperand(1))) {
14855 Even = false;
14856 Odd = false;
14857 break;
14858 }
14859 SDValue N0 = N->getOperand(0);
14860
14861 // All elements are extracted from the same vector.
14862 if (!Vector) {
14863 Vector = N0.getNode();
14864 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
14865 // BUILD_VECTOR.
14866 if (VT.getVectorElementType() !=
14867 N0.getValueType().getVectorElementType())
14868 break;
14869 } else if (Vector != N0.getNode()) {
14870 Odd = false;
14871 Even = false;
14872 break;
14873 }
14874
14875 // Extracted values are either at Even indices <0,2,4,...> or at Odd
14876 // indices <1,3,5,...>.
14877 uint64_t Val = N->getConstantOperandVal(1);
14878 if (Val == 2 * i) {
14879 Even = true;
14880 continue;
14881 }
14882 if (Val - 1 == 2 * i) {
14883 Odd = true;
14884 continue;
14885 }
14886
14887 // Something does not match: abort.
14888 Odd = false;
14889 Even = false;
14890 break;
14891 }
14892 if (Even || Odd) {
14893 SDValue LHS =
14894 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
14895 DAG.getConstant(0, dl, MVT::i64));
14896 SDValue RHS =
14897 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
14898 DAG.getConstant(NumElts, dl, MVT::i64));
14899
14900 if (Even && !Odd)
14901 return DAG.getNode(AArch64ISD::UZP1, dl, VT, LHS, RHS);
14902 if (Odd && !Even)
14903 return DAG.getNode(AArch64ISD::UZP2, dl, VT, LHS, RHS);
14904 }
14905 }
14906
14907 // Use DUP for non-constant splats. For f32 constant splats, reduce to
14908 // i32 and try again.
14909 if (usesOnlyOneValue) {
14910 if (!isConstant) {
14911 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14912 Value.getValueType() != VT) {
14913 LLVM_DEBUG(
14914 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
14915 return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
14916 }
14917
14918 // This is actually a DUPLANExx operation, which keeps everything vectory.
14919
14920 SDValue Lane = Value.getOperand(1);
14921 Value = Value.getOperand(0);
14922 if (Value.getValueSizeInBits() == 64) {
14923 LLVM_DEBUG(
14924 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
14925 "widening it\n");
14926 Value = WidenVector(Value, DAG);
14927 }
14928
14929 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
14930 return DAG.getNode(Opcode, dl, VT, Value, Lane);
14931 }
14932
14933 if (VT.isFloatingPoint()) {
14934 SmallVector<SDValue, 8> Ops;
14935 EVT EltTy = VT.getVectorElementType();
14936 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
14937 EltTy == MVT::f64) && "Unsupported floating-point vector type");
14938 LLVM_DEBUG(
14939 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
14940 "BITCASTS, and try again\n");
14941 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
14942 for (unsigned i = 0; i < NumElts; ++i)
14943 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
14944 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
14945 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
14946 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
14947 Val.dump(););
14948 Val = LowerBUILD_VECTOR(Val, DAG);
14949 if (Val.getNode())
14950 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
14951 }
14952 }
14953
14954 // If we need to insert a small number of different non-constant elements and
14955 // the vector width is sufficiently large, prefer using DUP with the common
14956 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
14957 // skip the constant lane handling below.
14958 bool PreferDUPAndInsert =
14959 !isConstant && NumDifferentLanes >= 1 &&
14960 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
14961 NumDifferentLanes >= NumConstantLanes;
14962
14963 // If there was only one constant value used and for more than one lane,
14964 // start by splatting that value, then replace the non-constant lanes. This
14965 // is better than the default, which will perform a separate initialization
14966 // for each lane.
14967 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
14968 // Firstly, try to materialize the splat constant.
14969 SDValue Val = DAG.getSplatBuildVector(VT, dl, ConstantValue);
14970 unsigned BitSize = VT.getScalarSizeInBits();
14971 APInt ConstantValueAPInt(1, 0);
14972 if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
14973 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
14974 if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
14975 !ConstantValueAPInt.isAllOnes()) {
14976 Val = ConstantBuildVector(Val, DAG, Subtarget);
14977 if (!Val)
14978 // Otherwise, materialize the constant and splat it.
14979 Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
14980 }
14981
14982 // Now insert the non-constant lanes.
14983 for (unsigned i = 0; i < NumElts; ++i) {
14984 SDValue V = Op.getOperand(i);
14985 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
14986 if (!isIntOrFPConstant(V))
14987 // Note that type legalization likely mucked about with the VT of the
14988 // source operand, so we may have to convert it here before inserting.
14989 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
14990 }
14991 return Val;
14992 }
14993
14994 // This will generate a load from the constant pool.
14995 if (isConstant) {
14996 LLVM_DEBUG(
14997 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
14998 "expansion\n");
14999 return SDValue();
15000 }
15001
15002 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
15003 // v4i32s. This is really a truncate, which we can construct out of (legal)
15004 // concats and truncate nodes.
15005 if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
15006 return M;
15007
15008 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
15009 if (NumElts >= 4) {
15010 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
15011 return Shuffle;
15012
15013 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
15014 return Shuffle;
15015 }
15016
15017 if (PreferDUPAndInsert) {
15018 // First, build a constant vector with the common element.
15019 SmallVector<SDValue, 8> Ops(NumElts, Value);
15020 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
15021 // Next, insert the elements that do not match the common value.
15022 for (unsigned I = 0; I < NumElts; ++I)
15023 if (Op.getOperand(I) != Value)
15024 NewVector =
15025 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
15026 Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
15027
15028 return NewVector;
15029 }
15030
15031 // If vector consists of two different values, try to generate two DUPs and
15032 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
15033 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
15034 SmallVector<SDValue, 2> Vals;
15035 // Check that each value's consecutive count is half the number of vector
15036 // elements. In this case, we can use CONCAT_VECTORS. For example,
15037 //
15038 // canUseVECTOR_CONCAT = true;
15039 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
15040 // t24, t24, t24, t24, t24, t24, t24, t24
15041 //
15042 // canUseVECTOR_CONCAT = false;
15043 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
15044 // t24, t24, t24, t24, t24, t24, t24, t24
15045 bool canUseVECTOR_CONCAT = true;
15046 for (auto Pair : DifferentValueMap) {
15047 // Check that both values have the same consecutive count, which is NumElts / 2.
15048 if (Pair.second != NumElts / 2)
15049 canUseVECTOR_CONCAT = false;
15050 Vals.push_back(Pair.first);
15051 }
15052
15053 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
15054 // CONCAT_VECTORs. For example,
15055 //
15056 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
15057 // t24, t24, t24, t24, t24, t24, t24, t24
15058 // ==>
15059 // t26: v8i8 = AArch64ISD::DUP t23
15060 // t28: v8i8 = AArch64ISD::DUP t24
15061 // t29: v16i8 = concat_vectors t26, t28
15062 if (canUseVECTOR_CONCAT) {
15063 EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
15064 if (isTypeLegal(SubVT) && SubVT.isVector() &&
15065 SubVT.getVectorNumElements() >= 2) {
15066 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
15067 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
15068 SDValue DUP1 =
15069 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops1), DAG);
15070 SDValue DUP2 =
15071 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops2), DAG);
15072 SDValue CONCAT_VECTORS =
15073 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DUP1, DUP2);
15074 return CONCAT_VECTORS;
15075 }
15076 }
15077
15078 // Let's try to generate VECTOR_SHUFFLE. For example,
15079 //
15080 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
15081 // ==>
15082 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
15083 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
15084 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
15085 if (NumElts >= 8) {
15086 SmallVector<int, 16> MaskVec;
15087 // Build the mask for VECTOR_SHUFFLE.
15088 SDValue FirstLaneVal = Op.getOperand(0);
15089 for (unsigned i = 0; i < NumElts; ++i) {
15090 SDValue Val = Op.getOperand(i);
15091 if (FirstLaneVal == Val)
15092 MaskVec.push_back(i);
15093 else
15094 MaskVec.push_back(i + NumElts);
15095 }
15096
15097 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
15098 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
15099 SDValue VEC1 = DAG.getBuildVector(VT, dl, Ops1);
15100 SDValue VEC2 = DAG.getBuildVector(VT, dl, Ops2);
15101 SDValue VECTOR_SHUFFLE =
15102 DAG.getVectorShuffle(VT, dl, VEC1, VEC2, MaskVec);
15103 return VECTOR_SHUFFLE;
15104 }
15105 }
15106
15107 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
15108 // know the default expansion would otherwise fall back on something even
15109 // worse. For a vector with one or two non-undef values, that's
15110 // scalar_to_vector for the elements followed by a shuffle (provided the
15111 // shuffle is valid for the target) and materialization element by element
15112 // on the stack followed by a load for everything else.
15113 if (!isConstant && !usesOnlyOneValue) {
15114 LLVM_DEBUG(
15115 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
15116 "of INSERT_VECTOR_ELT\n");
15117
15118 SDValue Vec = DAG.getUNDEF(VT);
15119 SDValue Op0 = Op.getOperand(0);
15120 unsigned i = 0;
15121
15122 // Use SCALAR_TO_VECTOR for lane zero to
15123 // a) Avoid a RMW dependency on the full vector register, and
15124 // b) Allow the register coalescer to fold away the copy if the
15125 // value is already in an S or D register, and we're forced to emit an
15126 // INSERT_SUBREG that we can't fold anywhere.
15127 //
15128 // We also allow types like i8 and i16 which are illegal scalar but legal
15129 // vector element types. After type-legalization the inserted value is
15130 // extended (i32) and it is safe to cast them to the vector type by ignoring
15131 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
15132 if (!Op0.isUndef()) {
15133 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
15134 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
15135 ++i;
15136 }
15137 LLVM_DEBUG({
15138 if (i < NumElts)
15139 dbgs() << "Creating nodes for the other vector elements:\n";
15140 });
15141 for (; i < NumElts; ++i) {
15142 SDValue V = Op.getOperand(i);
15143 if (V.isUndef())
15144 continue;
15145 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
15146 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
15147 }
15148 return Vec;
15149 }
15150
15151 LLVM_DEBUG(
15152 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
15153 "better alternative\n");
15154 return SDValue();
15155}
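As a standalone illustration of the two-value case handled above (not part of this file; the helper name is invented), the shuffle mask picks lane i from the first splat when the lane matches lane 0 and from the second splat (indices NumElts and up) otherwise:

#include <cstdint>
#include <vector>

// Hypothetical sketch mirroring the MaskVec loop for a build_vector whose
// lanes carry only two distinct values.
std::vector<int> buildTwoValueShuffleMask(const std::vector<uint64_t> &Lanes) {
  std::vector<int> Mask;
  const unsigned NumElts = Lanes.size();
  for (unsigned I = 0; I < NumElts; ++I)
    Mask.push_back(Lanes[I] == Lanes[0] ? int(I) : int(I + NumElts));
  return Mask; // e.g. {A,A,A,A,B,B,B,B} -> {0,1,2,3,12,13,14,15}
}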
15156
15157SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
15158 SelectionDAG &DAG) const {
15159 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15160 !Subtarget->isNeonAvailable()))
15161 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
15162
15163 assert(Op.getValueType().isScalableVector() &&
15164 isTypeLegal(Op.getValueType()) &&
15165 "Expected legal scalable vector type!");
15166
15167 if (isTypeLegal(Op.getOperand(0).getValueType())) {
15168 unsigned NumOperands = Op->getNumOperands();
15169 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
15170 "Unexpected number of operands in CONCAT_VECTORS");
15171
15172 if (NumOperands == 2)
15173 return Op;
15174
15175 // Concat each pair of subvectors and pack into the lower half of the array.
15176 SmallVector<SDValue> ConcatOps(Op->ops());
15177 while (ConcatOps.size() > 1) {
15178 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
15179 SDValue V1 = ConcatOps[I];
15180 SDValue V2 = ConcatOps[I + 1];
15181 EVT SubVT = V1.getValueType();
15182 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
15183 ConcatOps[I / 2] =
15184 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
15185 }
15186 ConcatOps.resize(ConcatOps.size() / 2);
15187 }
15188 return ConcatOps[0];
15189 }
15190
15191 return SDValue();
15192}
15193
15194SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
15195 SelectionDAG &DAG) const {
15196 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
15197
15198 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15199 !Subtarget->isNeonAvailable()))
15200 return LowerFixedLengthInsertVectorElt(Op, DAG);
15201
15202 EVT VT = Op.getOperand(0).getValueType();
15203
15204 if (VT.getScalarType() == MVT::i1) {
15205 EVT VectorVT = getPromotedVTForPredicate(VT);
15206 SDLoc DL(Op);
15207 SDValue ExtendedVector =
15208 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
15209 SDValue ExtendedValue =
15210 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
15211 VectorVT.getScalarType().getSizeInBits() < 32
15212 ? MVT::i32
15213 : VectorVT.getScalarType());
15214 ExtendedVector =
15215 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
15216 ExtendedValue, Op.getOperand(2));
15217 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
15218 }
15219
15220 // Check for non-constant or out of range lane.
15221 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
15222 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
15223 return SDValue();
15224
15225 return Op;
15226}
15227
15228SDValue
15229AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
15230 SelectionDAG &DAG) const {
15231 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
15232 EVT VT = Op.getOperand(0).getValueType();
15233
15234 if (VT.getScalarType() == MVT::i1) {
15235 // We can't directly extract from an SVE predicate; extend it first.
15236 // (This isn't the only possible lowering, but it's straightforward.)
15237 EVT VectorVT = getPromotedVTForPredicate(VT);
15238 SDLoc DL(Op);
15239 SDValue Extend =
15240 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
15241 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
15242 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
15243 Extend, Op.getOperand(1));
15244 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
15245 }
15246
15247 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
15248 return LowerFixedLengthExtractVectorElt(Op, DAG);
15249
15250 // Check for non-constant or out of range lane.
15251 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
15252 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
15253 return SDValue();
15254
15255 // Insertion/extraction are legal for V128 types.
15256 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
15257 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
15258 VT == MVT::v8f16 || VT == MVT::v8bf16)
15259 return Op;
15260
15261 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
15262 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
15263 VT != MVT::v4bf16)
15264 return SDValue();
15265
15266 // For V64 types, we perform extraction by expanding the value
15267 // to a V128 type and perform the extraction on that.
15268 SDLoc DL(Op);
15269 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
15270 EVT WideTy = WideVec.getValueType();
15271
15272 EVT ExtrTy = WideTy.getVectorElementType();
15273 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
15274 ExtrTy = MVT::i32;
15275
15276 // For extractions, we just return the result directly.
15277 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
15278 Op.getOperand(1));
15279}
15280
15281SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
15282 SelectionDAG &DAG) const {
15283 EVT VT = Op.getValueType();
15284   assert(VT.isFixedLengthVector() &&
15285          "Only cases that extract a fixed length vector are supported!");
15286 EVT InVT = Op.getOperand(0).getValueType();
15287
15288 // If we don't have legal types yet, do nothing
15289 if (!isTypeLegal(InVT))
15290 return SDValue();
15291
15292 if (InVT.is128BitVector()) {
15293 assert(VT.is64BitVector() && "Extracting unexpected vector type!");
15294 unsigned Idx = Op.getConstantOperandVal(1);
15295
15296 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
15297 if (Idx == 0)
15298 return Op;
15299
15300 // If this is extracting the upper 64-bits of a 128-bit vector, we match
15301 // that directly.
15302 if (Idx * InVT.getScalarSizeInBits() == 64 && Subtarget->isNeonAvailable())
15303 return Op;
15304 }
15305
15306 if (InVT.isScalableVector() ||
15307 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
15308 SDLoc DL(Op);
15309 SDValue Vec = Op.getOperand(0);
15310 SDValue Idx = Op.getOperand(1);
15311
15312     EVT PackedVT = getPackedSVEVectorVT(InVT.getVectorElementType());
15313     if (PackedVT != InVT) {
15314 // Pack input into the bottom part of an SVE register and try again.
15315 SDValue Container = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PackedVT,
15316 DAG.getUNDEF(PackedVT), Vec,
15317 DAG.getVectorIdxConstant(0, DL));
15318 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Container, Idx);
15319 }
15320
15321 // This will get matched by custom code during ISelDAGToDAG.
15322 if (isNullConstant(Idx))
15323 return Op;
15324
15325 assert(InVT.isScalableVector() && "Unexpected vector type!");
15326 // Move requested subvector to the start of the vector and try again.
15327 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, InVT, Vec, Vec, Idx);
15328 return convertFromScalableVector(DAG, VT, Splice);
15329 }
15330
15331 return SDValue();
15332}
15333
15334SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
15335 SelectionDAG &DAG) const {
15336 assert(Op.getValueType().isScalableVector() &&
15337 "Only expect to lower inserts into scalable vectors!");
15338
15339 EVT InVT = Op.getOperand(1).getValueType();
15340 unsigned Idx = Op.getConstantOperandVal(2);
15341
15342 SDValue Vec0 = Op.getOperand(0);
15343 SDValue Vec1 = Op.getOperand(1);
15344 SDLoc DL(Op);
15345 EVT VT = Op.getValueType();
15346
15347 if (InVT.isScalableVector()) {
15348 if (!isTypeLegal(VT))
15349 return SDValue();
15350
15351 // Break down insert_subvector into simpler parts.
15352 if (VT.getVectorElementType() == MVT::i1) {
15353 unsigned NumElts = VT.getVectorMinNumElements();
15354 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
15355
15356 SDValue Lo, Hi;
15357 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
15358 DAG.getVectorIdxConstant(0, DL));
15359 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
15360 DAG.getVectorIdxConstant(NumElts / 2, DL));
15361 if (Idx < (NumElts / 2))
15362         Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
15363                          DAG.getVectorIdxConstant(Idx, DL));
15364       else
15365 Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
15366 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
15367
15368 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15369 }
15370
15371 // We can select these directly.
15372 if (isTypeLegal(InVT) && Vec0.isUndef())
15373 return Op;
15374
15375 // Ensure the subvector is half the size of the main vector.
15376 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
15377 return SDValue();
15378
15379     // Here narrow and wide refer to the vector element types. After "casting"
15380     // both vectors must have the same bit length and so because the subvector
15381     // has fewer elements, those elements need to be bigger.
15382     EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
15383     EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
15384
15385 // NOP cast operands to the largest legal vector of the same element count.
15386 if (VT.isFloatingPoint()) {
15387 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
15388 Vec1 = getSVESafeBitCast(NarrowVT, Vec1, DAG);
15389 } else {
15390 // Legal integer vectors are already their largest so Vec0 is fine as is.
15391 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
15392 Vec1 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, Vec1);
15393 }
15394
15395 // To replace the top/bottom half of vector V with vector SubV we widen the
15396 // preserved half of V, concatenate this to SubV (the order depending on the
15397 // half being replaced) and then narrow the result.
15398 SDValue Narrow;
15399 if (Idx == 0) {
15400 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
15401 HiVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, HiVec0);
15402 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
15403 } else {
15404       assert(Idx == InVT.getVectorMinNumElements() &&
15405              "Invalid subvector index!");
15406 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
15407 LoVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, LoVec0);
15408 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
15409 }
15410
15411 return getSVESafeBitCast(VT, Narrow, DAG);
15412 }
15413
15414 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
15415 // This will be matched by custom code during ISelDAGToDAG.
15416 if (Vec0.isUndef())
15417 return Op;
15418
15419 std::optional<unsigned> PredPattern =
15420         getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
15421     auto PredTy = VT.changeVectorElementType(MVT::i1);
15422 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
15423 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
15424 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
15425 }
15426
15427 return SDValue();
15428}
15429
15430static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
15431 if (Op.getOpcode() != AArch64ISD::DUP &&
15432 Op.getOpcode() != ISD::SPLAT_VECTOR &&
15433 Op.getOpcode() != ISD::BUILD_VECTOR)
15434 return false;
15435
15436 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
15437 !isAllConstantBuildVector(Op, SplatVal))
15438 return false;
15439
15440 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
15441 !isa<ConstantSDNode>(Op->getOperand(0)))
15442 return false;
15443
15444 SplatVal = Op->getConstantOperandVal(0);
15445 if (Op.getValueType().getVectorElementType() != MVT::i64)
15446 SplatVal = (int32_t)SplatVal;
15447
15448 Negated = false;
15449 if (isPowerOf2_64(SplatVal))
15450 return true;
15451
15452 Negated = true;
15453 if (isPowerOf2_64(-SplatVal)) {
15454 SplatVal = -SplatVal;
15455 return true;
15456 }
15457
15458 return false;
15459}
15460
15461SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
15462 EVT VT = Op.getValueType();
15463 SDLoc dl(Op);
15464
15465 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
15466 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
15467
15468 assert(VT.isScalableVector() && "Expected a scalable vector.");
15469
15470 bool Signed = Op.getOpcode() == ISD::SDIV;
15471 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
15472
15473 bool Negated;
15474 uint64_t SplatVal;
15475 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
15476 SDValue Pg = getPredicateForScalableVector(DAG, dl, VT);
15477 SDValue Res =
15478 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0),
15479 DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32));
15480 if (Negated)
15481 Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res);
15482
15483 return Res;
15484 }
15485
15486 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
15487 return LowerToPredicatedOp(Op, DAG, PredOpcode);
15488
15489 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
15490 // operations, and truncate the result.
15491 EVT WidenedVT;
15492 if (VT == MVT::nxv16i8)
15493 WidenedVT = MVT::nxv8i16;
15494 else if (VT == MVT::nxv8i16)
15495 WidenedVT = MVT::nxv4i32;
15496 else
15497 llvm_unreachable("Unexpected Custom DIV operation");
15498
15499 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
15500 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
15501 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
15502 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
15503 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
15504 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
15505 SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
15506 SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
15507 SDValue ResultLoCast = DAG.getNode(AArch64ISD::NVCAST, dl, VT, ResultLo);
15508 SDValue ResultHiCast = DAG.getNode(AArch64ISD::NVCAST, dl, VT, ResultHi);
15509 return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLoCast, ResultHiCast);
15510}
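For reference, a minimal scalar sketch (not part of this file) of the divide-by-power-of-two case above, under the assumption that AArch64ISD::SRAD_MERGE_OP1 performs a signed divide by 2^K rounding toward zero:

#include <cstdint>

// Hypothetical scalar model: signed divide by +/-2^K as a biased arithmetic
// shift plus an optional negate. Assumes >> on a negative value is an
// arithmetic shift, as AArch64 compilers provide.
constexpr int64_t sdivByPow2(int64_t X, unsigned K, bool Negated) {
  int64_t Bias = X < 0 ? (int64_t(1) << K) - 1 : 0;
  int64_t Quotient = (X + Bias) >> K;
  return Negated ? -Quotient : Quotient;
}
static_assert(sdivByPow2(-7, 2, false) == -1, "-7 / 4 truncates toward zero");
static_assert(sdivByPow2(7, 2, true) == -1, "7 / -4 truncates toward zero");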
15511
15512bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles(
15513 EVT VT, unsigned DefinedValues) const {
15514 if (!Subtarget->isNeonAvailable())
15515 return false;
15516   return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
15517}
15518
15519bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
15520   // Currently no fixed length shuffles that require SVE are legal.
15521 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
15522 return false;
15523
15524 if (VT.getVectorNumElements() == 4 &&
15525 (VT.is128BitVector() || VT.is64BitVector())) {
15526 unsigned Cost = getPerfectShuffleCost(M);
15527 if (Cost <= 1)
15528 return true;
15529 }
15530
15531 bool DummyBool;
15532 int DummyInt;
15533 unsigned DummyUnsigned;
15534
15535 unsigned EltSize = VT.getScalarSizeInBits();
15536 unsigned NumElts = VT.getVectorNumElements();
15537 return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
15538 isREVMask(M, EltSize, NumElts, 64) ||
15539 isREVMask(M, EltSize, NumElts, 32) ||
15540 isREVMask(M, EltSize, NumElts, 16) ||
15541 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
15542 isTRNMask(M, NumElts, DummyUnsigned) ||
15543 isUZPMask(M, NumElts, DummyUnsigned) ||
15544 isZIPMask(M, NumElts, DummyUnsigned) ||
15545 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
15546 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
15547 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
15548 isINSMask(M, NumElts, DummyBool, DummyInt) ||
15549 isConcatMask(M, VT, VT.getSizeInBits() == 128));
15550}
15551
15552bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
15553                                                   EVT VT) const {
15554 // Just delegate to the generic legality, clear masks aren't special.
15555 return isShuffleMaskLegal(M, VT);
15556}
15557
15558/// getVShiftImm - Check if this is a valid build_vector for the immediate
15559/// operand of a vector shift operation, where all the elements of the
15560/// build_vector must have the same constant integer value.
15561static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
15562 // Ignore bit_converts.
15563 while (Op.getOpcode() == ISD::BITCAST)
15564 Op = Op.getOperand(0);
15565 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
15566 APInt SplatBits, SplatUndef;
15567 unsigned SplatBitSize;
15568 bool HasAnyUndefs;
15569 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
15570 HasAnyUndefs, ElementBits) ||
15571 SplatBitSize > ElementBits)
15572 return false;
15573 Cnt = SplatBits.getSExtValue();
15574 return true;
15575}
15576
15577/// isVShiftLImm - Check if this is a valid build_vector for the immediate
15578/// operand of a vector shift left operation. That value must be in the range:
15579/// 0 <= Value < ElementBits for a left shift; or
15580/// 0 <= Value <= ElementBits for a long left shift.
15581static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
15582 assert(VT.isVector() && "vector shift count is not a vector type");
15583 int64_t ElementBits = VT.getScalarSizeInBits();
15584 if (!getVShiftImm(Op, ElementBits, Cnt))
15585 return false;
15586 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
15587}
15588
15589/// isVShiftRImm - Check if this is a valid build_vector for the immediate
15590/// operand of a vector shift right operation. The value must be in the range:
15591/// 1 <= Value <= ElementBits for a right shift; or
15592static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
15593 assert(VT.isVector() && "vector shift count is not a vector type");
15594 int64_t ElementBits = VT.getScalarSizeInBits();
15595 if (!getVShiftImm(Op, ElementBits, Cnt))
15596 return false;
15597 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
15598}
15599
15600SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
15601 SelectionDAG &DAG) const {
15602 EVT VT = Op.getValueType();
15603
15604 if (VT.getScalarType() == MVT::i1) {
15605 // Lower i1 truncate to `(x & 1) != 0`.
15606 SDLoc dl(Op);
15607 EVT OpVT = Op.getOperand(0).getValueType();
15608 SDValue Zero = DAG.getConstant(0, dl, OpVT);
15609 SDValue One = DAG.getConstant(1, dl, OpVT);
15610 SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
15611 return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
15612 }
15613
15614 if (!VT.isVector() || VT.isScalableVector())
15615 return SDValue();
15616
15617 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
15618 !Subtarget->isNeonAvailable()))
15619 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
15620
15621 return SDValue();
15622}
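A tiny standalone sketch (not part of this file) of the i1-truncate rule used above, where only the low bit of the source value is meaningful:

#include <cstdint>

// Hypothetical scalar model of the `(x & 1) != 0` lowering for trunc-to-i1.
constexpr bool truncToI1(uint32_t X) { return (X & 1) != 0; }
static_assert(truncToI1(3) && !truncToI1(2), "only the low bit survives");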
15623
15624 // Check if we can lower this SRL to a rounding shift instruction. ResVT is
15625// possibly a truncated type, it tells how many bits of the value are to be
15626// used.
15627static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT,
15628                                            SelectionDAG &DAG,
15629 unsigned &ShiftValue,
15630 SDValue &RShOperand) {
15631 if (Shift->getOpcode() != ISD::SRL)
15632 return false;
15633
15634 EVT VT = Shift.getValueType();
15635 assert(VT.isScalableVT());
15636
15637 auto ShiftOp1 =
15638 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Shift->getOperand(1)));
15639 if (!ShiftOp1)
15640 return false;
15641
15642 ShiftValue = ShiftOp1->getZExtValue();
15643 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
15644 return false;
15645
15646 SDValue Add = Shift->getOperand(0);
15647 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
15648 return false;
15649
15650   assert(ResVT.getScalarSizeInBits() <= VT.getScalarSizeInBits() &&
15651          "ResVT must be truncated or same type as the shift.");
15652 // Check if an overflow can lead to incorrect results.
15653 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
15654 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
15655 return false;
15656
15657 auto AddOp1 =
15658 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
15659 if (!AddOp1)
15660 return false;
15661 uint64_t AddValue = AddOp1->getZExtValue();
15662 if (AddValue != 1ULL << (ShiftValue - 1))
15663 return false;
15664
15665 RShOperand = Add->getOperand(0);
15666 return true;
15667}
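For reference, a standalone scalar sketch (not part of this file) of the add-then-shift pattern matched above, which a rounding shift instruction such as URSHR can compute in one step:

#include <cstdint>

// Hypothetical scalar model: srl(add(X, 1 << (S - 1)), S) is an unsigned
// right shift that rounds to nearest instead of truncating.
constexpr uint64_t roundingShiftRight(uint64_t X, unsigned S) {
  return (X + (uint64_t(1) << (S - 1))) >> S;
}
static_assert(roundingShiftRight(10, 2) == 3, "10/4 = 2.5 rounds up to 3");
static_assert(roundingShiftRight(9, 2) == 2, "9/4 = 2.25 rounds down to 2");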
15668
15669SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
15670 SelectionDAG &DAG) const {
15671 EVT VT = Op.getValueType();
15672 SDLoc DL(Op);
15673 int64_t Cnt;
15674
15675 if (!Op.getOperand(1).getValueType().isVector())
15676 return Op;
15677 unsigned EltSize = VT.getScalarSizeInBits();
15678
15679 switch (Op.getOpcode()) {
15680 case ISD::SHL:
15681 if (VT.isScalableVector() ||
15682         useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
15683       return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
15684
15685 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
15686 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
15687 DAG.getConstant(Cnt, DL, MVT::i32));
15688 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
15689 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
15690 MVT::i32),
15691 Op.getOperand(0), Op.getOperand(1));
15692 case ISD::SRA:
15693 case ISD::SRL:
15694 if (VT.isScalableVector() &&
15695 (Subtarget->hasSVE2() ||
15696 (Subtarget->hasSME() && Subtarget->isStreaming()))) {
15697 SDValue RShOperand;
15698 unsigned ShiftValue;
15699 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
15700 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
15701 getPredicateForVector(DAG, DL, VT), RShOperand,
15702 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
15703 }
15704
15705 if (VT.isScalableVector() ||
15706 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
15707 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
15708                                                 : AArch64ISD::SRL_PRED;
15709       return LowerToPredicatedOp(Op, DAG, Opc);
15710 }
15711
15712 // Right shift immediate
15713 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
15714 unsigned Opc =
15715 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
15716 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
15717 DAG.getConstant(Cnt, DL, MVT::i32), Op->getFlags());
15718 }
15719
15720 // Right shift register. Note, there is not a shift right register
15721 // instruction, but the shift left register instruction takes a signed
15722 // value, where negative numbers specify a right shift.
15723 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
15724 : Intrinsic::aarch64_neon_ushl;
15725 // negate the shift amount
15726 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
15727 Op.getOperand(1));
15728 SDValue NegShiftLeft =
15729         DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
15730                     DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
15731 NegShift);
15732 return NegShiftLeft;
15733 }
15734
15735 llvm_unreachable("unexpected shift opcode");
15736}
15737
15738static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
15739                                    AArch64CC::CondCode CC, bool NoNans, EVT VT,
15740 const SDLoc &dl, SelectionDAG &DAG) {
15741 EVT SrcVT = LHS.getValueType();
15742 assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
15743 "function only supposed to emit natural comparisons");
15744
15745 APInt SplatValue;
15746 APInt SplatUndef;
15747 unsigned SplatBitSize = 0;
15748 bool HasAnyUndefs;
15749
15750 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
15751 bool IsCnst = BVN && BVN->isConstantSplat(SplatValue, SplatUndef,
15752 SplatBitSize, HasAnyUndefs);
15753
15754 bool IsZero = IsCnst && SplatValue == 0;
15755 bool IsOne =
15756 IsCnst && SrcVT.getScalarSizeInBits() == SplatBitSize && SplatValue == 1;
15757 bool IsMinusOne = IsCnst && SplatValue.isAllOnes();
15758
15759 if (SrcVT.getVectorElementType().isFloatingPoint()) {
15760 switch (CC) {
15761 default:
15762 return SDValue();
15763 case AArch64CC::NE: {
15764 SDValue Fcmeq;
15765 if (IsZero)
15766 Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
15767 else
15768 Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
15769 return DAG.getNOT(dl, Fcmeq, VT);
15770 }
15771 case AArch64CC::EQ:
15772 if (IsZero)
15773 return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
15774 return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
15775 case AArch64CC::GE:
15776 if (IsZero)
15777 return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
15778 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
15779 case AArch64CC::GT:
15780 if (IsZero)
15781 return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
15782 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
15783 case AArch64CC::LE:
15784 if (!NoNans)
15785 return SDValue();
15786     // If we ignore NaNs then we can use the LS implementation.
15787 [[fallthrough]];
15788 case AArch64CC::LS:
15789 if (IsZero)
15790 return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
15791 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
15792 case AArch64CC::LT:
15793 if (!NoNans)
15794 return SDValue();
15795     // If we ignore NaNs then we can use the MI implementation.
15796 [[fallthrough]];
15797 case AArch64CC::MI:
15798 if (IsZero)
15799 return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
15800 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
15801 }
15802 }
15803
15804 switch (CC) {
15805 default:
15806 return SDValue();
15807 case AArch64CC::NE: {
15808 SDValue Cmeq;
15809 if (IsZero)
15810 Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
15811 else
15812 Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
15813 return DAG.getNOT(dl, Cmeq, VT);
15814 }
15815 case AArch64CC::EQ:
15816 if (IsZero)
15817 return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
15818 return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
15819 case AArch64CC::GE:
15820 if (IsZero)
15821 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
15822 return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
15823 case AArch64CC::GT:
15824 if (IsZero)
15825 return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
15826 if (IsMinusOne)
15827 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
15828 return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
15829 case AArch64CC::LE:
15830 if (IsZero)
15831 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
15832 return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
15833 case AArch64CC::LS:
15834 return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
15835 case AArch64CC::LO:
15836 return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
15837 case AArch64CC::LT:
15838 if (IsZero)
15839 return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
15840 if (IsOne)
15841 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
15842 return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
15843 case AArch64CC::HI:
15844 return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
15845 case AArch64CC::HS:
15846 return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
15847 }
15848}
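The constant-splat special cases above rest on simple signed-integer identities; a standalone sketch (not part of this file) spelling them out:

#include <cstdint>

// Hypothetical scalar check of the identities used for the +/-1 splats:
// X > -1 is the same as X >= 0 (CMGT -1 becomes CMGEz), and X < 1 is the
// same as X <= 0 (CMLT 1 becomes CMLEz).
constexpr bool compareIdentitiesHold(int32_t X) {
  return ((X > -1) == (X >= 0)) && ((X < 1) == (X <= 0));
}
static_assert(compareIdentitiesHold(-5) && compareIdentitiesHold(0) &&
                  compareIdentitiesHold(7),
              "the compare-with-zero forms are equivalent");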
15849
15850SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
15851 SelectionDAG &DAG) const {
15852 if (Op.getValueType().isScalableVector())
15853 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
15854
15855 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
15856 !Subtarget->isNeonAvailable()))
15857 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
15858
15859 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
15860 SDValue LHS = Op.getOperand(0);
15861 SDValue RHS = Op.getOperand(1);
15862 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
15863 SDLoc dl(Op);
15864
15865 if (LHS.getValueType().getVectorElementType().isInteger()) {
15866 assert(LHS.getValueType() == RHS.getValueType());
15867     AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
15868     SDValue Cmp =
15869 EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
15870 return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
15871 }
15872
15873 // Lower isnan(x) | isnan(never-nan) to x != x.
15874 // Lower !isnan(x) & !isnan(never-nan) to x == x.
15875 if (CC == ISD::SETUO || CC == ISD::SETO) {
15876 bool OneNaN = false;
15877 if (LHS == RHS) {
15878 OneNaN = true;
15879 } else if (DAG.isKnownNeverNaN(RHS)) {
15880 OneNaN = true;
15881 RHS = LHS;
15882 } else if (DAG.isKnownNeverNaN(LHS)) {
15883 OneNaN = true;
15884 LHS = RHS;
15885 }
15886 if (OneNaN) {
15887       CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
15888     }
15889 }
15890
15891 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
15892
15893 // Make v4f16 (only) fcmp operations utilise vector instructions
15894   // v8f16 support will be a little more complicated
15895 if ((!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) ||
15896 LHS.getValueType().getVectorElementType() == MVT::bf16) {
15897 if (LHS.getValueType().getVectorNumElements() == 4) {
15898 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
15899 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
15900 SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
15901 DAG.ReplaceAllUsesWith(Op, NewSetcc);
15902 CmpVT = MVT::v4i32;
15903 } else
15904 return SDValue();
15905 }
15906
15907 assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
15908 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
15909 LHS.getValueType().getVectorElementType() != MVT::f128);
15910
15911 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
15912 // clean. Some of them require two branches to implement.
15913 AArch64CC::CondCode CC1, CC2;
15914 bool ShouldInvert;
15915 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
15916
15917 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
15918 SDValue Cmp =
15919 EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
15920 if (!Cmp.getNode())
15921 return SDValue();
15922
15923 if (CC2 != AArch64CC::AL) {
15924 SDValue Cmp2 =
15925 EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
15926 if (!Cmp2.getNode())
15927 return SDValue();
15928
15929 Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
15930 }
15931
15932 Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
15933
15934 if (ShouldInvert)
15935 Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
15936
15937 return Cmp;
15938}
15939
15940static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
15941 SelectionDAG &DAG) {
15942 SDValue VecOp = ScalarOp.getOperand(0);
15943 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
15944 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
15945 DAG.getConstant(0, DL, MVT::i64));
15946}
15947
15948static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
15949 SDLoc DL, SelectionDAG &DAG) {
15950 unsigned ScalarOpcode;
15951 switch (Opcode) {
15952 case ISD::VECREDUCE_AND:
15953 ScalarOpcode = ISD::AND;
15954 break;
15955 case ISD::VECREDUCE_OR:
15956 ScalarOpcode = ISD::OR;
15957 break;
15958 case ISD::VECREDUCE_XOR:
15959 ScalarOpcode = ISD::XOR;
15960 break;
15961 default:
15962 llvm_unreachable("Expected bitwise vector reduction");
15963 return SDValue();
15964 }
15965
15966 EVT VecVT = Vec.getValueType();
15967 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
15968 "Expected power-of-2 length vector");
15969
15970 EVT ElemVT = VecVT.getVectorElementType();
15971
15972 SDValue Result;
15973 unsigned NumElems = VecVT.getVectorNumElements();
15974
15975 // Special case for boolean reductions
15976 if (ElemVT == MVT::i1) {
15977 // Split large vectors into smaller ones
15978 if (NumElems > 16) {
15979 SDValue Lo, Hi;
15980 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
15981 EVT HalfVT = Lo.getValueType();
15982 SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
15983 return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
15984 }
15985
15986 // Results of setcc operations get widened to 128 bits if their input
15987 // operands are 128 bits wide, otherwise vectors that are less than 64 bits
15988 // get widened to neatly fit a 64 bit register, so e.g. <4 x i1> gets
15989 // lowered to either <4 x i16> or <4 x i32>. Sign extending to this element
15990 // size leads to the best codegen, since e.g. setcc results might need to be
15991 // truncated otherwise.
15992 unsigned ExtendedWidth = 64;
15993 if (Vec.getOpcode() == ISD::SETCC &&
15994 Vec.getOperand(0).getValueSizeInBits() >= 128) {
15995 ExtendedWidth = 128;
15996 }
15997 EVT ExtendedVT = MVT::getIntegerVT(std::max(ExtendedWidth / NumElems, 8u));
15998
15999 // any_ext doesn't work with umin/umax, so only use it for uadd.
16000 unsigned ExtendOp =
16001 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
16002 SDValue Extended = DAG.getNode(
16003 ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
16004 // The uminp/uminv and umaxp/umaxv instructions don't have .2d variants, so
16005 // in that case we bitcast the sign extended values from v2i64 to v4i32
16006 // before reduction for optimal code generation.
16007 if ((ScalarOpcode == ISD::AND || ScalarOpcode == ISD::OR) &&
16008 NumElems == 2 && ExtendedWidth == 128) {
16009 Extended = DAG.getBitcast(MVT::v4i32, Extended);
16010 ExtendedVT = MVT::i32;
16011 }
16012 switch (ScalarOpcode) {
16013 case ISD::AND:
16014 Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
16015 break;
16016 case ISD::OR:
16017 Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
16018 break;
16019 case ISD::XOR:
16020 Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
16021 break;
16022 default:
16023 llvm_unreachable("Unexpected Opcode");
16024 }
16025
16026 Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
16027 } else {
16028 // Iteratively split the vector in half and combine using the bitwise
16029 // operation until it fits in a 64 bit register.
16030 while (VecVT.getSizeInBits() > 64) {
16031 SDValue Lo, Hi;
16032 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
16033 VecVT = Lo.getValueType();
16034 NumElems = VecVT.getVectorNumElements();
16035 Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
16036 }
16037
16038 EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
16039
16040 // Do the remaining work on a scalar since it allows the code generator to
16041 // combine the shift and bitwise operation into one instruction and since
16042 // integer instructions can have higher throughput than vector instructions.
16043 SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
16044
16045 // Iteratively combine the lower and upper halves of the scalar using the
16046 // bitwise operation, halving the relevant region of the scalar in each
16047 // iteration, until the relevant region is just one element of the original
16048 // vector.
16049 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
16050 SDValue ShiftAmount =
16051 DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
16052 SDValue Shifted =
16053 DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
16054 Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
16055 }
16056
16057 Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
16058 }
16059
16060 return DAG.getAnyExtOrTrunc(Result, DL, VT);
16061}
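A standalone scalar sketch (not part of this file) of the halving reduction used on the non-boolean path above, for eight i8 lanes packed into one 64-bit value and an XOR reduction:

#include <cstdint>

// Hypothetical scalar model: fold the upper half onto the lower half until
// lane 0 of the packed value holds the full reduction.
constexpr uint8_t xorReduce8x8(uint64_t V) {
  for (unsigned Lanes = 4; Lanes > 0; Lanes /= 2)
    V ^= V >> (Lanes * 8);
  return static_cast<uint8_t>(V);
}
static_assert(xorReduce8x8(0x0102030405060708ULL) ==
                  (1 ^ 2 ^ 3 ^ 4 ^ 5 ^ 6 ^ 7 ^ 8),
              "lane 0 ends up holding the combined value");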
16062
16063SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
16064 SelectionDAG &DAG) const {
16065 SDValue Src = Op.getOperand(0);
16066
16067 // Try to lower fixed length reductions to SVE.
16068 EVT SrcVT = Src.getValueType();
16069 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
16070 Op.getOpcode() == ISD::VECREDUCE_AND ||
16071 Op.getOpcode() == ISD::VECREDUCE_OR ||
16072 Op.getOpcode() == ISD::VECREDUCE_XOR ||
16073 Op.getOpcode() == ISD::VECREDUCE_FADD ||
16074 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
16075 SrcVT.getVectorElementType() == MVT::i64);
16076 if (SrcVT.isScalableVector() ||
16077       useSVEForFixedLengthVectorVT(
16078           SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
16079
16080 if (SrcVT.getVectorElementType() == MVT::i1)
16081 return LowerPredReductionToSVE(Op, DAG);
16082
16083 switch (Op.getOpcode()) {
16084 case ISD::VECREDUCE_ADD:
16085 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
16086 case ISD::VECREDUCE_AND:
16087 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
16088 case ISD::VECREDUCE_OR:
16089 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
16090     case ISD::VECREDUCE_SMAX:
16091       return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
16092     case ISD::VECREDUCE_SMIN:
16093       return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
16094     case ISD::VECREDUCE_UMAX:
16095       return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
16096     case ISD::VECREDUCE_UMIN:
16097       return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
16098     case ISD::VECREDUCE_XOR:
16099       return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
16100     case ISD::VECREDUCE_FADD:
16101       return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
16102     case ISD::VECREDUCE_FMAX:
16103       return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
16104     case ISD::VECREDUCE_FMIN:
16105       return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
16106     case ISD::VECREDUCE_FMAXIMUM:
16107       return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
16108     case ISD::VECREDUCE_FMINIMUM:
16109       return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
16110 default:
16111 llvm_unreachable("Unhandled fixed length reduction");
16112 }
16113 }
16114
16115 // Lower NEON reductions.
16116 SDLoc dl(Op);
16117 switch (Op.getOpcode()) {
16118 case ISD::VECREDUCE_AND:
16119 case ISD::VECREDUCE_OR:
16120 case ISD::VECREDUCE_XOR:
16121 return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
16122 Op.getValueType(), dl, DAG);
16123 case ISD::VECREDUCE_ADD:
16124 return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
16125   case ISD::VECREDUCE_SMAX:
16126     return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
16127   case ISD::VECREDUCE_SMIN:
16128     return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
16129   case ISD::VECREDUCE_UMAX:
16130     return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
16131   case ISD::VECREDUCE_UMIN:
16132     return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
16133 default:
16134 llvm_unreachable("Unhandled reduction");
16135 }
16136}
16137
16138SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
16139 SelectionDAG &DAG) const {
16140 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
16141 // No point replacing if we don't have the relevant instruction/libcall anyway
16142 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
16143 return SDValue();
16144
16145 // LSE has an atomic load-clear instruction, but not a load-and.
16146 SDLoc dl(Op);
16147 MVT VT = Op.getSimpleValueType();
16148 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
16149 SDValue RHS = Op.getOperand(2);
16150 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
16151 RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getAllOnesConstant(dl, VT), RHS);
16152 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
16153 Op.getOperand(0), Op.getOperand(1), RHS,
16154 AN->getMemOperand());
16155}
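A standalone scalar sketch (not part of this file) of why the operand inversion above is valid, assuming the atomic clear operation stores back `old & ~operand`:

#include <cstdint>

// Hypothetical scalar model: an atomic AND with RHS is the same memory update
// as an atomic bit-clear with ~RHS, since Mem & RHS == Mem & ~(~RHS).
constexpr uint32_t andViaClear(uint32_t Mem, uint32_t RHS) {
  uint32_t ClearMask = ~RHS; // operand handed to the clear operation
  return Mem & ~ClearMask;   // value the clear operation stores back
}
static_assert(andViaClear(0xF0F0F0F0u, 0x0FF000FFu) ==
                  (0xF0F0F0F0u & 0x0FF000FFu),
              "AND expressed as a clear of the complemented mask");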
16156
16157SDValue
16158AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
16159 SelectionDAG &DAG) const {
16160
16161 SDLoc dl(Op);
16162 // Get the inputs.
16163 SDNode *Node = Op.getNode();
16164 SDValue Chain = Op.getOperand(0);
16165 SDValue Size = Op.getOperand(1);
16166   MaybeAlign Align =
16167       cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
16168 EVT VT = Node->getValueType(0);
16169
16170   if (DAG.getMachineFunction().getFunction().hasFnAttribute(
16171           "no-stack-arg-probe")) {
16172 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
16173 Chain = SP.getValue(1);
16174 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
16175 if (Align)
16176 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16177 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
16178 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
16179 SDValue Ops[2] = {SP, Chain};
16180 return DAG.getMergeValues(Ops, dl);
16181 }
16182
16183 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
16184
16185 EVT PtrVT = getPointerTy(DAG.getDataLayout());
16186   SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
16187                                                PtrVT, 0);
16188
16189 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
16190 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
16191 if (Subtarget->hasCustomCallingConv())
16192 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
16193
16194 Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
16195 DAG.getConstant(4, dl, MVT::i64));
16196 Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
16197 Chain =
16198 DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
16199 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
16200 DAG.getRegisterMask(Mask), Chain.getValue(1));
16201 // To match the actual intent better, we should read the output from X15 here
16202 // again (instead of potentially spilling it to the stack), but rereading Size
16203 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
16204 // here.
16205
16206 Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
16207 DAG.getConstant(4, dl, MVT::i64));
16208
16209 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
16210 Chain = SP.getValue(1);
16211 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
16212 if (Align)
16213 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16214 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
16215 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
16216
16217 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
16218
16219 SDValue Ops[2] = {SP, Chain};
16220 return DAG.getMergeValues(Ops, dl);
16221}
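A standalone sketch (not part of this file) of the size scaling in the sequence above: the allocation size travels to the stack-probe helper in X15 in units of 16 bytes and is scaled back afterwards:

#include <cstdint>

// Hypothetical scalar model of the SRL-by-4 / SHL-by-4 pair around the call:
// only whole 16-byte units survive the round trip.
constexpr uint64_t probeSizeRoundTrip(uint64_t SizeInBytes) {
  uint64_t UnitsInX15 = SizeInBytes >> 4; // value passed to the helper
  return UnitsInX15 << 4;                 // bytes actually subtracted from SP
}
static_assert(probeSizeRoundTrip(256) == 256 && probeSizeRoundTrip(100) == 96,
              "sizes are handled in 16-byte units");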
16222
16223SDValue
16224AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
16225 SelectionDAG &DAG) const {
16226 // Get the inputs.
16227 SDNode *Node = Op.getNode();
16228 SDValue Chain = Op.getOperand(0);
16229 SDValue Size = Op.getOperand(1);
16230
16231   MaybeAlign Align =
16232       cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
16233 SDLoc dl(Op);
16234 EVT VT = Node->getValueType(0);
16235
16236 // Construct the new SP value in a GPR.
16237 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
16238 Chain = SP.getValue(1);
16239 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
16240 if (Align)
16241 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16242 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
16243
16244 // Set the real SP to the new value with a probing loop.
16245 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
16246 SDValue Ops[2] = {SP, Chain};
16247 return DAG.getMergeValues(Ops, dl);
16248}
16249
16250SDValue
16251AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16252 SelectionDAG &DAG) const {
16253   MachineFunction &MF = DAG.getMachineFunction();
16254
16255 if (Subtarget->isTargetWindows())
16256 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
16257 else if (hasInlineStackProbe(MF))
16258 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
16259 else
16260 return SDValue();
16261}
16262
16263SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
16264 unsigned NewOp) const {
16265 if (Subtarget->hasSVE2())
16266 return LowerToPredicatedOp(Op, DAG, NewOp);
16267
16268 // Default to expand.
16269 return SDValue();
16270}
16271
16272SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
16273 SelectionDAG &DAG) const {
16274 EVT VT = Op.getValueType();
16275 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
16276
16277 SDLoc DL(Op);
16278 APInt MulImm = Op.getConstantOperandAPInt(0);
16279 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
16280 VT);
16281}
16282
16283/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
16284template <unsigned NumVecs>
16285static bool
16286setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
16287              AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
16288   Info.opc = ISD::INTRINSIC_VOID;
16289   // Retrieve EC from first vector argument.
16290   const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
16291   ElementCount EC = VT.getVectorElementCount();
16292 #ifndef NDEBUG
16293 // Check the assumption that all input vectors are the same type.
16294 for (unsigned I = 0; I < NumVecs; ++I)
16295 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
16296 "Invalid type.");
16297#endif
16298 // memVT is `NumVecs * VT`.
16299   Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
16300                                 EC * NumVecs);
16301 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
16302 Info.offset = 0;
16303 Info.align.reset();
16304   Info.flags = MachineMemOperand::MOStore;
16305   return true;
16306}
16307
16308/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
16309/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
16310/// specified in the intrinsic calls.
16311bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
16312                                                const CallInst &I,
16313 MachineFunction &MF,
16314 unsigned Intrinsic) const {
16315 auto &DL = I.getDataLayout();
16316 switch (Intrinsic) {
16317 case Intrinsic::aarch64_sve_st2:
16318 return setInfoSVEStN<2>(*this, DL, Info, I);
16319 case Intrinsic::aarch64_sve_st3:
16320 return setInfoSVEStN<3>(*this, DL, Info, I);
16321 case Intrinsic::aarch64_sve_st4:
16322 return setInfoSVEStN<4>(*this, DL, Info, I);
16323 case Intrinsic::aarch64_neon_ld2:
16324 case Intrinsic::aarch64_neon_ld3:
16325 case Intrinsic::aarch64_neon_ld4:
16326 case Intrinsic::aarch64_neon_ld1x2:
16327 case Intrinsic::aarch64_neon_ld1x3:
16328 case Intrinsic::aarch64_neon_ld1x4: {
16329     Info.opc = ISD::INTRINSIC_W_CHAIN;
16330     uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
16331 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
16332 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16333 Info.offset = 0;
16334 Info.align.reset();
16335 // volatile loads with NEON intrinsics not supported
16336     Info.flags = MachineMemOperand::MOLoad;
16337     return true;
16338 }
16339 case Intrinsic::aarch64_neon_ld2lane:
16340 case Intrinsic::aarch64_neon_ld3lane:
16341 case Intrinsic::aarch64_neon_ld4lane:
16342 case Intrinsic::aarch64_neon_ld2r:
16343 case Intrinsic::aarch64_neon_ld3r:
16344 case Intrinsic::aarch64_neon_ld4r: {
16345     Info.opc = ISD::INTRINSIC_W_CHAIN;
16346     // These intrinsics return a struct whose members all have the same vector type.
16347 Type *RetTy = I.getType();
16348 auto *StructTy = cast<StructType>(RetTy);
16349 unsigned NumElts = StructTy->getNumElements();
16350 Type *VecTy = StructTy->getElementType(0);
16351 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
16352 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
16353 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16354 Info.offset = 0;
16355 Info.align.reset();
16356 // volatile loads with NEON intrinsics not supported
16357     Info.flags = MachineMemOperand::MOLoad;
16358     return true;
16359 }
16360 case Intrinsic::aarch64_neon_st2:
16361 case Intrinsic::aarch64_neon_st3:
16362 case Intrinsic::aarch64_neon_st4:
16363 case Intrinsic::aarch64_neon_st1x2:
16364 case Intrinsic::aarch64_neon_st1x3:
16365 case Intrinsic::aarch64_neon_st1x4: {
16366     Info.opc = ISD::INTRINSIC_VOID;
16367     unsigned NumElts = 0;
16368 for (const Value *Arg : I.args()) {
16369 Type *ArgTy = Arg->getType();
16370 if (!ArgTy->isVectorTy())
16371 break;
16372 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
16373 }
16374 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
16375 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16376 Info.offset = 0;
16377 Info.align.reset();
16378 // volatile stores with NEON intrinsics not supported
16379     Info.flags = MachineMemOperand::MOStore;
16380     return true;
16381 }
16382 case Intrinsic::aarch64_neon_st2lane:
16383 case Intrinsic::aarch64_neon_st3lane:
16384 case Intrinsic::aarch64_neon_st4lane: {
16385     Info.opc = ISD::INTRINSIC_VOID;
16386     unsigned NumElts = 0;
16387     // All of the vector arguments have the same type.
16388 Type *VecTy = I.getArgOperand(0)->getType();
16389 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
16390
16391 for (const Value *Arg : I.args()) {
16392 Type *ArgTy = Arg->getType();
16393 if (!ArgTy->isVectorTy())
16394 break;
16395 NumElts += 1;
16396 }
16397
16398 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
16399 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16400 Info.offset = 0;
16401 Info.align.reset();
16402 // volatile stores with NEON intrinsics not supported
16403     Info.flags = MachineMemOperand::MOStore;
16404     return true;
16405 }
16406 case Intrinsic::aarch64_ldaxr:
16407 case Intrinsic::aarch64_ldxr: {
16408 Type *ValTy = I.getParamElementType(0);
16409     Info.opc = ISD::INTRINSIC_W_CHAIN;
16410     Info.memVT = MVT::getVT(ValTy);
16411 Info.ptrVal = I.getArgOperand(0);
16412 Info.offset = 0;
16413 Info.align = DL.getABITypeAlign(ValTy);
16414     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
16415     return true;
16416 }
16417 case Intrinsic::aarch64_stlxr:
16418 case Intrinsic::aarch64_stxr: {
16419 Type *ValTy = I.getParamElementType(1);
16420     Info.opc = ISD::INTRINSIC_W_CHAIN;
16421     Info.memVT = MVT::getVT(ValTy);
16422 Info.ptrVal = I.getArgOperand(1);
16423 Info.offset = 0;
16424 Info.align = DL.getABITypeAlign(ValTy);
16425     Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
16426     return true;
16427 }
16428 case Intrinsic::aarch64_ldaxp:
16429 case Intrinsic::aarch64_ldxp:
16430     Info.opc = ISD::INTRINSIC_W_CHAIN;
16431     Info.memVT = MVT::i128;
16432 Info.ptrVal = I.getArgOperand(0);
16433 Info.offset = 0;
16434 Info.align = Align(16);
16435     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
16436     return true;
16437 case Intrinsic::aarch64_stlxp:
16438 case Intrinsic::aarch64_stxp:
16439     Info.opc = ISD::INTRINSIC_W_CHAIN;
16440     Info.memVT = MVT::i128;
16441 Info.ptrVal = I.getArgOperand(2);
16442 Info.offset = 0;
16443 Info.align = Align(16);
16444     Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
16445     return true;
16446 case Intrinsic::aarch64_sve_ldnt1: {
16447 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
16448     Info.opc = ISD::INTRINSIC_W_CHAIN;
16449     Info.memVT = MVT::getVT(I.getType());
16450 Info.ptrVal = I.getArgOperand(1);
16451 Info.offset = 0;
16452 Info.align = DL.getABITypeAlign(ElTy);
16453     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
16454     return true;
16455 }
16456 case Intrinsic::aarch64_sve_stnt1: {
16457 Type *ElTy =
16458 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
16459     Info.opc = ISD::INTRINSIC_W_CHAIN;
16460     Info.memVT = MVT::getVT(I.getOperand(0)->getType());
16461 Info.ptrVal = I.getArgOperand(2);
16462 Info.offset = 0;
16463 Info.align = DL.getABITypeAlign(ElTy);
16464     Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
16465     return true;
16466 }
16467 case Intrinsic::aarch64_mops_memset_tag: {
16468 Value *Dst = I.getArgOperand(0);
16469 Value *Val = I.getArgOperand(1);
16470     Info.opc = ISD::INTRINSIC_W_CHAIN;
16471     Info.memVT = MVT::getVT(Val->getType());
16472 Info.ptrVal = Dst;
16473 Info.offset = 0;
16474 Info.align = I.getParamAlign(0).valueOrOne();
16475     Info.flags = MachineMemOperand::MOStore;
16476     // The size of the memory being operated on is unknown at this point
16477     Info.size = MemoryLocation::UnknownSize;
16478     return true;
16479 }
16480 default:
16481 break;
16482 }
16483
16484 return false;
16485}
16486
16487bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
16488                                                  ISD::LoadExtType ExtTy,
16489 EVT NewVT) const {
16490 // TODO: This may be worth removing. Check regression tests for diffs.
16491 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
16492 return false;
16493
16494 // If we're reducing the load width in order to avoid having to use an extra
16495 // instruction to do extension then it's probably a good idea.
16496 if (ExtTy != ISD::NON_EXTLOAD)
16497 return true;
16498 // Don't reduce load width if it would prevent us from combining a shift into
16499 // the offset.
16500 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
16501 assert(Mem);
16502 const SDValue &Base = Mem->getBasePtr();
16503 if (Base.getOpcode() == ISD::ADD &&
16504 Base.getOperand(1).getOpcode() == ISD::SHL &&
16505 Base.getOperand(1).hasOneUse() &&
16506 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
16507 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
16508 if (Mem->getMemoryVT().isScalableVector())
16509 return false;
16510 // The shift can be combined if it matches the size of the value being
16511 // loaded (and so reducing the width would make it not match).
16512 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
16513 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
16514 if (ShiftAmount == Log2_32(LoadBytes))
16515 return false;
16516 }
16517 // We have no reason to disallow reducing the load width, so allow it.
16518 return true;
16519}
16520
16521// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
16522bool AArch64TargetLowering::shouldRemoveRedundantExtend(SDValue Extend) const {
16523   EVT VT = Extend.getValueType();
16524 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
16525 SDValue Extract = Extend.getOperand(0);
16526 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
16527 Extract = Extract.getOperand(0);
16528 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
16529 EVT VecVT = Extract.getOperand(0).getValueType();
16530 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
16531 return false;
16532 }
16533 }
16534 return true;
16535}
16536
16537 // Truncations from 64-bit GPR to 32-bit GPR are free.
16538bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
16539   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
16540 return false;
16541 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
16542 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
16543 return NumBits1 > NumBits2;
16544}
16545bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
16546   if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
16547 return false;
16548 uint64_t NumBits1 = VT1.getFixedSizeInBits();
16549 uint64_t NumBits2 = VT2.getFixedSizeInBits();
16550 return NumBits1 > NumBits2;
16551}
16552
16553/// Check if it is profitable to hoist instruction in then/else to if.
16554 /// Not profitable if I and its user can form a FMA instruction
16555/// because we prefer FMSUB/FMADD.
16556bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
16557   if (I->getOpcode() != Instruction::FMul)
16558 return true;
16559
16560 if (!I->hasOneUse())
16561 return true;
16562
16563 Instruction *User = I->user_back();
16564
16565 if (!(User->getOpcode() == Instruction::FSub ||
16566 User->getOpcode() == Instruction::FAdd))
16567 return true;
16568
16569   const TargetOptions &Options = getTargetMachine().Options;
16570   const Function *F = I->getFunction();
16571 const DataLayout &DL = F->getDataLayout();
16572 Type *Ty = User->getOperand(0)->getType();
16573
16574 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
16575            isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
16576            (Options.AllowFPOpFusion == FPOpFusion::Fast ||
16577 Options.UnsafeFPMath));
16578}
16579
16580// All 32-bit GPR operations implicitly zero the high-half of the corresponding
16581// 64-bit GPR.
16582bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
16583   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
16584 return false;
16585 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
16586 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
16587 return NumBits1 == 32 && NumBits2 == 64;
16588}
16589bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
16590   if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
16591 return false;
16592 unsigned NumBits1 = VT1.getSizeInBits();
16593 unsigned NumBits2 = VT2.getSizeInBits();
16594 return NumBits1 == 32 && NumBits2 == 64;
16595}
16596
16597bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
16598   EVT VT1 = Val.getValueType();
16599 if (isZExtFree(VT1, VT2)) {
16600 return true;
16601 }
16602
16603 if (Val.getOpcode() != ISD::LOAD)
16604 return false;
16605
16606 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
16607 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
16608 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
16609 VT1.getSizeInBits() <= 32);
16610}
16611
16612bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
16613 if (isa<FPExtInst>(Ext))
16614 return false;
16615
16616 // Vector types are not free.
16617 if (Ext->getType()->isVectorTy())
16618 return false;
16619
16620 for (const Use &U : Ext->uses()) {
16621 // The extension is free if we can fold it with a left shift in an
16622 // addressing mode or an arithmetic operation: add, sub, and cmp.
16623
16624 // Is there a shift?
16625 const Instruction *Instr = cast<Instruction>(U.getUser());
16626
16627 // Is this a constant shift?
16628 switch (Instr->getOpcode()) {
16629 case Instruction::Shl:
16630 if (!isa<ConstantInt>(Instr->getOperand(1)))
16631 return false;
16632 break;
16633 case Instruction::GetElementPtr: {
16634 gep_type_iterator GTI = gep_type_begin(Instr);
16635 auto &DL = Ext->getDataLayout();
16636 std::advance(GTI, U.getOperandNo()-1);
16637 Type *IdxTy = GTI.getIndexedType();
16638 // This extension will end up with a shift because of the scaling factor.
16639 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
16640 // Get the shift amount based on the scaling factor:
16641 // log2(sizeof(IdxTy)) - log2(8).
16642 if (IdxTy->isScalableTy())
16643 return false;
16644 uint64_t ShiftAmt =
16645 llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
16646 3;
16647 // Is the constant foldable in the shift of the addressing mode?
16648 // I.e., shift amount is between 1 and 4 inclusive.
16649 if (ShiftAmt == 0 || ShiftAmt > 4)
16650 return false;
16651 break;
16652 }
16653 case Instruction::Trunc:
16654 // Check if this is a noop.
16655 // trunc(sext ty1 to ty2) to ty1.
16656 if (Instr->getType() == Ext->getOperand(0)->getType())
16657 continue;
16658 [[fallthrough]];
16659 default:
16660 return false;
16661 }
16662
16663 // At this point we can use the bfm family, so this extension is free
16664 // for that use.
16665 }
16666 return true;
16667}
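A standalone sketch (not part of this file) of the GEP scaling computation above, assuming the indexed type's store size in bits is a power of two:

#include <cstdint>

// Hypothetical scalar model: the implicit shift is log2(sizeof(IdxTy)), i.e.
// log2(store size in bits) - log2(8), and the extension only folds when that
// shift is between 1 and 4.
constexpr unsigned gepImplicitShift(unsigned StoreSizeInBits) {
  unsigned TrailingZeros = 0;
  while ((StoreSizeInBits & 1u) == 0) {
    StoreSizeInBits >>= 1;
    ++TrailingZeros;
  }
  return TrailingZeros - 3;
}
static_assert(gepImplicitShift(32) == 2 && gepImplicitShift(64) == 3,
              "i32 indexes scale by 4 (lsl #2), i64 by 8 (lsl #3)");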
16668
16669static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth,
16670 unsigned NumElts, bool IsLittleEndian,
16671 SmallVectorImpl<int> &Mask) {
16672 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth > 64)
16673 return false;
16674
16675 assert(DstWidth % SrcWidth == 0 &&
16676 "TBL lowering is not supported for a conversion instruction with this "
16677 "source and destination element type.");
16678
16679 unsigned Factor = DstWidth / SrcWidth;
16680 unsigned MaskLen = NumElts * Factor;
16681
16682 Mask.clear();
16683 Mask.resize(MaskLen, NumElts);
16684
16685 unsigned SrcIndex = 0;
16686 for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < MaskLen; I += Factor)
16687 Mask[I] = SrcIndex++;
16688
16689 return true;
16690}
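As a concrete standalone example (not part of this file) of the mask built above, a little-endian zero-extension of four i8 elements to i32 (Factor = 4) spreads the source bytes over every fourth slot and points the remaining slots at the appended zero lane:

#include <vector>

// Hypothetical rebuild of the mask for SrcWidth=8, DstWidth=32, NumElts=4 on
// little-endian: {0,4,4,4, 1,4,4,4, 2,4,4,4, 3,4,4,4}, where index 4
// (== NumElts) names the zero element of the second shuffle input.
std::vector<int> zextTblMaskExample() {
  const unsigned NumElts = 4, Factor = 32 / 8;
  std::vector<int> Mask(NumElts * Factor, static_cast<int>(NumElts));
  unsigned SrcIndex = 0;
  for (unsigned I = 0; I < NumElts * Factor; I += Factor)
    Mask[I] = static_cast<int>(SrcIndex++);
  return Mask;
}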
16691
16692static Value *createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op,
16693                                      FixedVectorType *ZExtTy,
16694 FixedVectorType *DstTy,
16695 bool IsLittleEndian) {
16696 auto *SrcTy = cast<FixedVectorType>(Op->getType());
16697 unsigned NumElts = SrcTy->getNumElements();
16698 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
16699 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
16700
16701 SmallVector<int> Mask;
16702 if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts, IsLittleEndian, Mask))
16703 return nullptr;
16704
16705 auto *FirstEltZero = Builder.CreateInsertElement(
16706 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
16707 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
16708 Result = Builder.CreateBitCast(Result, DstTy);
16709 if (DstTy != ZExtTy)
16710 Result = Builder.CreateZExt(Result, ZExtTy);
16711 return Result;
16712}
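// As an illustration (value names here are hypothetical), a zero-extend of
// <8 x i8> to <8 x i32> on a little-endian target becomes roughly:
//   %zero = insertelement <8 x i8> poison, i8 0, i64 0
//   %shuf = shufflevector <8 x i8> %op, <8 x i8> %zero,
//           <32 x i32> <i32 0, i32 8, i32 8, i32 8, i32 1, i32 8, ...>
//   %res  = bitcast <32 x i8> %shuf to <8 x i32>
// which the backend is then expected to select into TBL instructions.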
16713
16714static Value *createTblShuffleForSExt(IRBuilder<> &Builder, Value *Op,
16715 FixedVectorType *DstTy,
16716 bool IsLittleEndian) {
16717 auto *SrcTy = cast<FixedVectorType>(Op->getType());
16718 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
16719 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
16720
16721 SmallVector<int> Mask;
16722 if (!createTblShuffleMask(SrcWidth, DstWidth, SrcTy->getNumElements(),
16723 !IsLittleEndian, Mask))
16724 return nullptr;
16725
16726 auto *FirstEltZero = Builder.CreateInsertElement(
16727 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
16728
16729 return Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
16730}
16731
16732static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
16733 IRBuilder<> Builder(TI);
16734 SmallVector<Value *> Parts;
16735 int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
16736 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
16737 auto *DstTy = cast<FixedVectorType>(TI->getType());
16738 assert(SrcTy->getElementType()->isIntegerTy() &&
16739 "Non-integer type source vector element is not supported");
16740 assert(DstTy->getElementType()->isIntegerTy(8) &&
16741 "Unsupported destination vector element type");
16742 unsigned SrcElemTySz =
16743 cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
16744 unsigned DstElemTySz =
16745 cast<IntegerType>(DstTy->getElementType())->getBitWidth();
16746 assert((SrcElemTySz % DstElemTySz == 0) &&
16747 "Cannot lower truncate to tbl instructions for a source element size "
16748 "that is not divisible by the destination element size");
16749 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
16750 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
16751 "Unsupported source vector element type size");
16752 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
16753
16754 // Create a mask to choose every nth byte from the source vector table of
16755 // bytes to create the truncated destination vector, where 'n' is the truncate
16756 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose
16757 // bytes 0, 8, 16, ..., (Y-1)*8 for the little-endian format.
16758 SmallVector<Constant *, 16> MaskConst;
16759 for (int Itr = 0; Itr < 16; Itr++) {
16760 if (Itr < NumElements)
16761 MaskConst.push_back(Builder.getInt8(
16762 IsLittleEndian ? Itr * TruncFactor
16763 : Itr * TruncFactor + (TruncFactor - 1)));
16764 else
16765 MaskConst.push_back(Builder.getInt8(255));
16766 }
16767
16768 int MaxTblSz = 128 * 4;
16769 int MaxSrcSz = SrcElemTySz * NumElements;
16770 int ElemsPerTbl =
16771 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
16772 assert(ElemsPerTbl <= 16 &&
16773 "Maximum elements selected using TBL instruction cannot exceed 16!");
16774
16775 int ShuffleCount = 128 / SrcElemTySz;
16776 SmallVector<int> ShuffleLanes;
16777 for (int i = 0; i < ShuffleCount; ++i)
16778 ShuffleLanes.push_back(i);
16779
16780 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
16781 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
16782 // call TBL & save the result in a vector of TBL results for combining later.
16783 SmallVector<Value *> Results;
16784 while (ShuffleLanes.back() < NumElements) {
16785 Parts.push_back(Builder.CreateBitCast(
16786 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
16787
16788 if (Parts.size() == 4) {
16789 Parts.push_back(ConstantVector::get(MaskConst));
16790 Results.push_back(
16791 Builder.CreateIntrinsic(Intrinsic::aarch64_neon_tbl4, VecTy, Parts));
16792 Parts.clear();
16793 }
16794
16795 for (int i = 0; i < ShuffleCount; ++i)
16796 ShuffleLanes[i] += ShuffleCount;
16797 }
16798
16799 assert((Parts.empty() || Results.empty()) &&
16800 "Lowering trunc for vectors requiring different TBL instructions is "
16801 "not supported!");
16802 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
16803 // registers
16804 if (!Parts.empty()) {
16805 Intrinsic::ID TblID;
16806 switch (Parts.size()) {
16807 case 1:
16808 TblID = Intrinsic::aarch64_neon_tbl1;
16809 break;
16810 case 2:
16811 TblID = Intrinsic::aarch64_neon_tbl2;
16812 break;
16813 case 3:
16814 TblID = Intrinsic::aarch64_neon_tbl3;
16815 break;
16816 }
16817
16818 Parts.push_back(ConstantVector::get(MaskConst));
16819 Results.push_back(Builder.CreateIntrinsic(TblID, VecTy, Parts));
16820 }
16821
16822 // Extract the destination vector from TBL result(s) after combining them
16823 // where applicable. Currently, at most two TBLs are supported.
16824 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
16825 "more than 2 tbl instructions!");
16826 Value *FinalResult = Results[0];
16827 if (Results.size() == 1) {
16828 if (ElemsPerTbl < 16) {
16829 SmallVector<int> FinalMask(ElemsPerTbl);
16830 std::iota(FinalMask.begin(), FinalMask.end(), 0);
16831 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
16832 }
16833 } else {
16834 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
16835 if (ElemsPerTbl < 16) {
16836 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
16837 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
16838 } else {
16839 std::iota(FinalMask.begin(), FinalMask.end(), 0);
16840 }
16841 FinalResult =
16842 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
16843 }
16844
16845 TI->replaceAllUsesWith(FinalResult);
16846 TI->eraseFromParent();
16847}
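// Worked example: for `trunc <16 x i32> %x to <16 x i8>` on a little-endian
// target, TruncFactor = 4 and the byte-selection mask is { 0, 4, 8, ..., 60 }.
// The 512-bit source is split into four 128-bit shuffles (lanes 0-3, 4-7,
// 8-11, 12-15); these saturate the four TBL table registers, so a single tbl4
// call produces all 16 result bytes and, since ElemsPerTbl == 16, no final
// shuffle is required.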
16848
16849bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
16850 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
16851 // shuffle_vector instructions are serialized when targeting SVE,
16852 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
16853 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
16854 return false;
16855
16856 // Try to optimize conversions using tbl. This requires materializing constant
16857 // index vectors, which can increase code size and add loads. Skip the
16858 // transform unless the conversion is in a loop block guaranteed to execute
16859 // and we are not optimizing for size.
16860 Function *F = I->getParent()->getParent();
16861 if (!L || L->getHeader() != I->getParent() || F->hasMinSize() ||
16862 F->hasOptSize())
16863 return false;
16864
16865 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
16866 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
16867 if (!SrcTy || !DstTy)
16868 return false;
16869
16870 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
16871 // lowered to tbl instructions to insert the original i8 elements
16872 // into i8x lanes. This is enabled for cases where it is beneficial.
16873 auto *ZExt = dyn_cast<ZExtInst>(I);
16874 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
16875 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
16876 if (DstWidth % 8 != 0)
16877 return false;
16878
16879 auto *TruncDstType =
16880 cast<FixedVectorType>(VectorType::getTruncatedElementVectorType(DstTy));
16881 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
16882 // the remaining ZExt folded into the user, don't use tbl lowering.
16883 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
16884 if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
16887 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
16888 return false;
16889
16890 DstTy = TruncDstType;
16891 }
16892
16893 // mul(zext(i8), sext) can be transformed into smull(zext, sext) which
16894 // performs one extend implicitly. If DstWidth is at most 4 * SrcWidth, at
16895 // most one extra extend step is needed and using tbl is not profitable.
16896 if (SrcWidth * 4 <= DstWidth && I->hasOneUser()) {
16897 auto *SingleUser = cast<Instruction>(*I->user_begin());
16898 if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))))
16899 return false;
16900 }
16901
16902 if (DstTy->getScalarSizeInBits() >= 64)
16903 return false;
16904
16905 IRBuilder<> Builder(ZExt);
16906 Value *Result = createTblShuffleForZExt(
16907 Builder, ZExt->getOperand(0), cast<FixedVectorType>(ZExt->getType()),
16908 DstTy, Subtarget->isLittleEndian());
16909 if (!Result)
16910 return false;
16911 ZExt->replaceAllUsesWith(Result);
16912 ZExt->eraseFromParent();
16913 return true;
16914 }
16915
16916 auto *UIToFP = dyn_cast<UIToFPInst>(I);
16917 if (UIToFP && ((SrcTy->getElementType()->isIntegerTy(8) &&
16918 DstTy->getElementType()->isFloatTy()) ||
16919 (SrcTy->getElementType()->isIntegerTy(16) &&
16920 DstTy->getElementType()->isDoubleTy()))) {
16921 IRBuilder<> Builder(I);
16922 Value *ZExt = createTblShuffleForZExt(
16923 Builder, I->getOperand(0), FixedVectorType::getInteger(DstTy),
16924 FixedVectorType::getInteger(DstTy), Subtarget->isLittleEndian());
16925 assert(ZExt && "Cannot fail for the i8 to float conversion");
16926 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
16927 I->replaceAllUsesWith(UI);
16928 I->eraseFromParent();
16929 return true;
16930 }
16931
16932 auto *SIToFP = dyn_cast<SIToFPInst>(I);
16933 if (SIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
16934 DstTy->getElementType()->isFloatTy()) {
16935 IRBuilder<> Builder(I);
16936 auto *Shuffle = createTblShuffleForSExt(Builder, I->getOperand(0),
16937 FixedVectorType::getInteger(DstTy),
16938 Subtarget->isLittleEndian());
16939 assert(Shuffle && "Cannot fail for the i8 to float conversion");
16940 auto *Cast = Builder.CreateBitCast(Shuffle, VectorType::getInteger(DstTy));
16941 auto *AShr = Builder.CreateAShr(Cast, 24, "", true);
16942 auto *SI = Builder.CreateSIToFP(AShr, DstTy);
16943 I->replaceAllUsesWith(SI);
16944 I->eraseFromParent();
16945 return true;
16946 }
16947
16948 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
16949 // followed by a truncate lowered to using tbl.4.
16950 auto *FPToUI = dyn_cast<FPToUIInst>(I);
16951 if (FPToUI &&
16952 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
16953 SrcTy->getElementType()->isFloatTy() &&
16954 DstTy->getElementType()->isIntegerTy(8)) {
16955 IRBuilder<> Builder(I);
16956 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
16957 VectorType::getInteger(SrcTy));
16958 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
16959 I->replaceAllUsesWith(TruncI);
16960 I->eraseFromParent();
16961 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
16962 return true;
16963 }
16964
16965 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
16966 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
16967 // per lane of the input that is represented using 1,2,3 or 4 128-bit table
16968 // registers
16969 auto *TI = dyn_cast<TruncInst>(I);
16970 if (TI && DstTy->getElementType()->isIntegerTy(8) &&
16971 ((SrcTy->getElementType()->isIntegerTy(32) ||
16972 SrcTy->getElementType()->isIntegerTy(64)) &&
16973 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
16974 createTblForTrunc(TI, Subtarget->isLittleEndian());
16975 return true;
16976 }
16977
16978 return false;
16979}
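// Summarizing the conversions that may be rewritten above (subject to the
// profitability and endianness checks), for example:
//   zext   <8 x i8>    to <8 x i32>   -> zero-interleaving shuffle (TBL)
//   uitofp <8 x i8>    to <8 x float> -> TBL-based zext, then uitofp
//   sitofp <8 x i8>    to <8 x float> -> TBL shuffle into the top byte,
//                                        ashr by 24, then sitofp
//   fptoui <8 x float> to <8 x i8>    -> wide fptoui, then TBL-based trunc
//   trunc  <8 x i64>   to <8 x i8>    -> TBL over 1-4 table registers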
16980
16981bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
16982 Align &RequiredAligment) const {
16983 if (!LoadedType.isSimple() ||
16984 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
16985 return false;
16986 // Cyclone supports unaligned accesses.
16987 RequiredAligment = Align(1);
16988 unsigned NumBits = LoadedType.getSizeInBits();
16989 return NumBits == 32 || NumBits == 64;
16990}
16991
16992/// A helper function for determining the number of interleaved accesses we
16993/// will generate when lowering accesses of the given type.
16994unsigned AArch64TargetLowering::getNumInterleavedAccesses(
16995 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
16996 unsigned VecSize = 128;
16997 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
16998 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
16999 if (UseScalable && isa<FixedVectorType>(VecTy))
17000 VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
17001 return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
17002}
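// For example, a <16 x i32> group on a NEON-only target needs
// ceil((16 * 32) / 128) = 4 interleaved accesses, while <8 x i16> (128 bits)
// needs only 1.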
17003
17004MachineMemOperand::Flags
17005AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
17006 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
17007 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
17008 return MOStridedAccess;
17009 return MachineMemOperand::MONone;
17010}
17011
17012bool AArch64TargetLowering::isLegalInterleavedAccessType(
17013 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
17014 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
17015 auto EC = VecTy->getElementCount();
17016 unsigned MinElts = EC.getKnownMinValue();
17017
17018 UseScalable = false;
17019
17020 if (isa<FixedVectorType>(VecTy) && !Subtarget->isNeonAvailable() &&
17021 (!Subtarget->useSVEForFixedLengthVectors() ||
17023 return false;
17024
17025 if (isa<ScalableVectorType>(VecTy) &&
17026 !Subtarget->isSVEorStreamingSVEAvailable())
17027 return false;
17028
17029 // Ensure the number of vector elements is greater than 1.
17030 if (MinElts < 2)
17031 return false;
17032
17033 // Ensure the element type is legal.
17034 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
17035 return false;
17036
17037 if (EC.isScalable()) {
17038 UseScalable = true;
17039 return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
17040 }
17041
17042 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
17043 if (Subtarget->useSVEForFixedLengthVectors()) {
17044 unsigned MinSVEVectorSize =
17045 std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
17046 if (VecSize % MinSVEVectorSize == 0 ||
17047 (VecSize < MinSVEVectorSize && isPowerOf2_32(MinElts) &&
17048 (!Subtarget->isNeonAvailable() || VecSize > 128))) {
17049 UseScalable = true;
17050 return true;
17051 }
17052 }
17053
17054 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
17055 // 128 will be split into multiple interleaved accesses.
17056 return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
17057}
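// For example, with NEON available (and no SVE fixed-length lowering),
// <4 x i32> (128 bits) and <8 x i8> (64 bits) are legal interleaved access
// types, <16 x i32> (512 bits) is legal because it splits into 128-bit
// accesses, while <2 x i16> (32 bits) is rejected.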
17058
17059static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
17060 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
17061 return ScalableVectorType::get(VTy->getElementType(), 2);
17062
17063 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
17064 return ScalableVectorType::get(VTy->getElementType(), 4);
17065
17066 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
17067 return ScalableVectorType::get(VTy->getElementType(), 8);
17068
17069 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
17070 return ScalableVectorType::get(VTy->getElementType(), 8);
17071
17072 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
17073 return ScalableVectorType::get(VTy->getElementType(), 2);
17074
17075 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
17076 return ScalableVectorType::get(VTy->getElementType(), 4);
17077
17078 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
17079 return ScalableVectorType::get(VTy->getElementType(), 8);
17080
17081 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
17082 return ScalableVectorType::get(VTy->getElementType(), 16);
17083
17084 llvm_unreachable("Cannot handle input vector type");
17085}
17086
17087static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
17088 bool Scalable, Type *LDVTy,
17089 Type *PtrTy) {
17090 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
17091 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
17092 Intrinsic::aarch64_sve_ld3_sret,
17093 Intrinsic::aarch64_sve_ld4_sret};
17094 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
17095 Intrinsic::aarch64_neon_ld3,
17096 Intrinsic::aarch64_neon_ld4};
17097 if (Scalable)
17098 return Intrinsic::getOrInsertDeclaration(M, SVELoads[Factor - 2], {LDVTy});
17099
17100 return Intrinsic::getOrInsertDeclaration(M, NEONLoads[Factor - 2],
17101 {LDVTy, PtrTy});
17102}
17103
17104static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
17105 bool Scalable, Type *STVTy,
17106 Type *PtrTy) {
17107 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
17108 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
17109 Intrinsic::aarch64_sve_st3,
17110 Intrinsic::aarch64_sve_st4};
17111 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
17112 Intrinsic::aarch64_neon_st3,
17113 Intrinsic::aarch64_neon_st4};
17114 if (Scalable)
17115 return Intrinsic::getOrInsertDeclaration(M, SVEStores[Factor - 2], {STVTy});
17116
17117 return Intrinsic::getOrInsertDeclaration(M, NEONStores[Factor - 2],
17118 {STVTy, PtrTy});
17119}
17120
17121/// Lower an interleaved load into a ldN intrinsic.
17122///
17123/// E.g. Lower an interleaved load (Factor = 2):
17124/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
17125/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
17126/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
17127///
17128/// Into:
17129/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
17130/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
17131/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
17132bool AArch64TargetLowering::lowerInterleavedLoad(
17133 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
17134 ArrayRef<unsigned> Indices, unsigned Factor) const {
17135 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
17136 "Invalid interleave factor");
17137 assert(!Shuffles.empty() && "Empty shufflevector input");
17138 assert(Shuffles.size() == Indices.size() &&
17139 "Unmatched number of shufflevectors and indices");
17140
17141 const DataLayout &DL = LI->getDataLayout();
17142
17143 VectorType *VTy = Shuffles[0]->getType();
17144
17145 // Skip if we do not have NEON and skip illegal vector types. We can
17146 // "legalize" wide vector types into multiple interleaved accesses as long as
17147 // the vector types are divisible by 128.
17148 bool UseScalable;
17149 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
17150 return false;
17151
17152 // Check if the interleave is a zext(shuffle), that can be better optimized
17153 // into shift / and masks. For the moment we do this just for uitofp (not
17154 // zext) to avoid issues with widening instructions.
17155 if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) {
17156 return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) &&
17157 SI->getType()->getScalarSizeInBits() * 4 ==
17158 SI->user_back()->getType()->getScalarSizeInBits();
17159 }))
17160 return false;
17161
17162 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
17163
17164 auto *FVTy = cast<FixedVectorType>(VTy);
17165
17166 // A pointer vector can not be the return type of the ldN intrinsics. Need to
17167 // load integer vectors first and then convert to pointer vectors.
17168 Type *EltTy = FVTy->getElementType();
17169 if (EltTy->isPointerTy())
17170 FVTy =
17171 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
17172
17173 // If we're going to generate more than one load, reset the sub-vector type
17174 // to something legal.
17175 FVTy = FixedVectorType::get(FVTy->getElementType(),
17176 FVTy->getNumElements() / NumLoads);
17177
17178 auto *LDVTy =
17179 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
17180
17181 IRBuilder<> Builder(LI);
17182
17183 // The base address of the load.
17184 Value *BaseAddr = LI->getPointerOperand();
17185
17186 Type *PtrTy = LI->getPointerOperandType();
17187 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
17188 LDVTy->getElementCount());
17189
17190 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
17191 UseScalable, LDVTy, PtrTy);
17192
17193 // Holds sub-vectors extracted from the load intrinsic return values. The
17194 // sub-vectors are associated with the shufflevector instructions they will
17195 // replace.
17196 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
17197
17198 Value *PTrue = nullptr;
17199 if (UseScalable) {
17200 std::optional<unsigned> PgPattern =
17201 getSVEPredPatternFromNumElements(FVTy->getNumElements());
17202 if (Subtarget->getMinSVEVectorSizeInBits() ==
17203 Subtarget->getMaxSVEVectorSizeInBits() &&
17204 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
17205 PgPattern = AArch64SVEPredPattern::all;
17206
17207 auto *PTruePat =
17208 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
17209 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
17210 {PTruePat});
17211 }
17212
17213 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
17214
17215 // If we're generating more than one load, compute the base address of
17216 // subsequent loads as an offset from the previous.
17217 if (LoadCount > 0)
17218 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
17219 FVTy->getNumElements() * Factor);
17220
17221 CallInst *LdN;
17222 if (UseScalable)
17223 LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
17224 else
17225 LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
17226
17227 // Extract and store the sub-vectors returned by the load intrinsic.
17228 for (unsigned i = 0; i < Shuffles.size(); i++) {
17229 ShuffleVectorInst *SVI = Shuffles[i];
17230 unsigned Index = Indices[i];
17231
17232 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
17233
17234 if (UseScalable)
17235 SubVec = Builder.CreateExtractVector(
17236 FVTy, SubVec,
17237 ConstantInt::get(Type::getInt64Ty(VTy->getContext()), 0));
17238
17239 // Convert the integer vector to pointer vector if the element is pointer.
17240 if (EltTy->isPointerTy())
17241 SubVec = Builder.CreateIntToPtr(
17242 SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
17243 FVTy->getNumElements()));
17244
17245 SubVecs[SVI].push_back(SubVec);
17246 }
17247 }
17248
17249 // Replace uses of the shufflevector instructions with the sub-vectors
17250 // returned by the load intrinsic. If a shufflevector instruction is
17251 // associated with more than one sub-vector, those sub-vectors will be
17252 // concatenated into a single wide vector.
17253 for (ShuffleVectorInst *SVI : Shuffles) {
17254 auto &SubVec = SubVecs[SVI];
17255 auto *WideVec =
17256 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
17257 SVI->replaceAllUsesWith(WideVec);
17258 }
17259
17260 return true;
17261}
17262
17263template <typename Iter>
17264bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
17265 int MaxLookupDist = 20;
17266 unsigned IdxWidth = DL.getIndexSizeInBits(0);
17267 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
17268 const Value *PtrA1 =
17269 Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
17270
17271 while (++It != End) {
17272 if (It->isDebugOrPseudoInst())
17273 continue;
17274 if (MaxLookupDist-- == 0)
17275 break;
17276 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
17277 const Value *PtrB1 =
17278 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
17279 DL, OffsetB);
17280 if (PtrA1 == PtrB1 &&
17281 (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
17282 .abs() == 16)
17283 return true;
17284 }
17285 }
17286
17287 return false;
17288}
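// E.g. if a 64-bit st2 candidate at %p has, within a few instructions, a
// store whose pointer is %p + 16 or %p - 16, this returns true and the
// interleaved-store lowering below backs off, expecting the zip + store-pair
// sequence mentioned there to be formed instead.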
17289
17290/// Lower an interleaved store into a stN intrinsic.
17291///
17292/// E.g. Lower an interleaved store (Factor = 3):
17293/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
17294/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
17295/// store <12 x i32> %i.vec, <12 x i32>* %ptr
17296///
17297/// Into:
17298/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
17299/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
17300/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
17301/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
17302///
17303/// Note that the new shufflevectors will be removed and we'll only generate one
17304/// st3 instruction in CodeGen.
17305///
17306/// Example for a more general valid mask (Factor 3). Lower:
17307/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
17308/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
17309/// store <12 x i32> %i.vec, <12 x i32>* %ptr
17310///
17311/// Into:
17312/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
17313/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
17314/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
17315/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
17316bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
17317 ShuffleVectorInst *SVI,
17318 unsigned Factor) const {
17319
17320 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
17321 "Invalid interleave factor");
17322
17323 auto *VecTy = cast<FixedVectorType>(SVI->getType());
17324 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
17325
17326 unsigned LaneLen = VecTy->getNumElements() / Factor;
17327 Type *EltTy = VecTy->getElementType();
17328 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
17329
17330 const DataLayout &DL = SI->getDataLayout();
17331 bool UseScalable;
17332
17333 // Skip if we do not have NEON and skip illegal vector types. We can
17334 // "legalize" wide vector types into multiple interleaved accesses as long as
17335 // the vector types are divisible by 128.
17336 if (!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
17337 return false;
17338
17339 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
17340
17341 Value *Op0 = SVI->getOperand(0);
17342 Value *Op1 = SVI->getOperand(1);
17343 IRBuilder<> Builder(SI);
17344
17345 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
17346 // vectors to integer vectors.
17347 if (EltTy->isPointerTy()) {
17348 Type *IntTy = DL.getIntPtrType(EltTy);
17349 unsigned NumOpElts =
17350 cast<FixedVectorType>(Op0->getType())->getNumElements();
17351
17352 // Convert to the corresponding integer vector.
17353 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
17354 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
17355 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
17356
17357 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
17358 }
17359
17360 // If we're going to generate more than one store, reset the lane length
17361 // and sub-vector type to something legal.
17362 LaneLen /= NumStores;
17363 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
17364
17365 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
17366 : SubVecTy;
17367
17368 // The base address of the store.
17369 Value *BaseAddr = SI->getPointerOperand();
17370
17371 auto Mask = SVI->getShuffleMask();
17372
17373 // Bail out if none of the mask indices are in range.
17374 // If the mask is `poison`, `Mask` may be a vector of -1s.
17375 // If all of them are `poison`, an OOB read would happen later.
17376 if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
17377 return false;
17378 }
17379 // A 64-bit st2 which does not start at element 0 will involve adding extra
17380 // ext elements, making the st2 unprofitable. Also, if there is a nearby store
17381 // that points to BaseAddr+16 or BaseAddr-16, it can be better left as a
17382 // zip;stp pair, which has higher throughput.
17383 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
17384 (Mask[0] != 0 ||
17385 hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
17386 DL) ||
17387 hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
17388 BaseAddr, DL)))
17389 return false;
17390
17391 Type *PtrTy = SI->getPointerOperandType();
17392 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
17393 STVTy->getElementCount());
17394
17395 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
17396 UseScalable, STVTy, PtrTy);
17397
17398 Value *PTrue = nullptr;
17399 if (UseScalable) {
17400 std::optional<unsigned> PgPattern =
17401 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
17402 if (Subtarget->getMinSVEVectorSizeInBits() ==
17403 Subtarget->getMaxSVEVectorSizeInBits() &&
17404 Subtarget->getMinSVEVectorSizeInBits() ==
17405 DL.getTypeSizeInBits(SubVecTy))
17406 PgPattern = AArch64SVEPredPattern::all;
17407
17408 auto *PTruePat =
17409 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
17410 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
17411 {PTruePat});
17412 }
17413
17414 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
17415
17415
17416 SmallVector<Value *> Ops;
17417
17418 // Split the shufflevector operands into sub vectors for the new stN call.
17419 for (unsigned i = 0; i < Factor; i++) {
17420 Value *Shuffle;
17421 unsigned IdxI = StoreCount * LaneLen * Factor + i;
17422 if (Mask[IdxI] >= 0) {
17423 Shuffle = Builder.CreateShuffleVector(
17424 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
17425 } else {
17426 unsigned StartMask = 0;
17427 for (unsigned j = 1; j < LaneLen; j++) {
17428 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
17429 if (Mask[IdxJ] >= 0) {
17430 StartMask = Mask[IdxJ] - j;
17431 break;
17432 }
17433 }
17434 // Note: Filling undef gaps with random elements is ok, since
17435 // those elements were being written anyway (with undefs).
17436 // In the case of all undefs we're defaulting to using elems from 0
17437 // Note: StartMask cannot be negative, it's checked in
17438 // isReInterleaveMask
17439 Shuffle = Builder.CreateShuffleVector(
17440 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
17441 }
17442
17443 if (UseScalable)
17444 Shuffle = Builder.CreateInsertVector(
17445 STVTy, UndefValue::get(STVTy), Shuffle,
17446 ConstantInt::get(Type::getInt64Ty(STVTy->getContext()), 0));
17447
17448 Ops.push_back(Shuffle);
17449 }
17450
17451 if (UseScalable)
17452 Ops.push_back(PTrue);
17453
17454 // If we're generating more than one store, compute the base address of
17455 // subsequent stores as an offset from the previous.
17456 if (StoreCount > 0)
17457 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
17458 BaseAddr, LaneLen * Factor);
17459
17460 Ops.push_back(BaseAddr);
17461 Builder.CreateCall(StNFunc, Ops);
17462 }
17463 return true;
17464}
17465
17466static bool getDeinterleave2Values(
17467 Value *DI, SmallVectorImpl<Instruction *> &DeinterleavedValues,
17468 SmallVectorImpl<Instruction *> &DeInterleaveDeadInsts) {
17469 if (!DI->hasNUses(2))
17470 return false;
17471 auto *Extr1 = dyn_cast<ExtractValueInst>(*(DI->user_begin()));
17472 auto *Extr2 = dyn_cast<ExtractValueInst>(*(++DI->user_begin()));
17473 if (!Extr1 || !Extr2)
17474 return false;
17475
17476 DeinterleavedValues.resize(2);
17477 // Place the values into the vector in the order of extraction:
17478 DeinterleavedValues[0x1 & (Extr1->getIndices()[0])] = Extr1;
17479 DeinterleavedValues[0x1 & (Extr2->getIndices()[0])] = Extr2;
17480 if (!DeinterleavedValues[0] || !DeinterleavedValues[1])
17481 return false;
17482
17483 // Make sure that the extracted values match the deinterleave tree pattern
17484 if (!match(DeinterleavedValues[0], m_ExtractValue<0>((m_Specific(DI)))) ||
17485 !match(DeinterleavedValues[1], m_ExtractValue<1>((m_Specific(DI))))) {
17486 LLVM_DEBUG(dbgs() << "matching deinterleave2 failed\n");
17487 return false;
17488 }
17489 // DeinterleavedValues will be replaced by the output of ld2
17490 DeInterleaveDeadInsts.insert(DeInterleaveDeadInsts.end(),
17491 DeinterleavedValues.begin(),
17492 DeinterleavedValues.end());
17493 return true;
17494}
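// For instance (type mangling and value names abbreviated), the matched
// factor-2 pattern looks like:
//   %di = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
//             @llvm.vector.deinterleave2(<vscale x 8 x i32> %wide)
//   %a  = extractvalue { ... } %di, 0
//   %b  = extractvalue { ... } %di, 1
// and DeinterleavedValues is populated as { %a, %b }.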
17495
17496/*
17497DeinterleaveIntrinsic tree:
17498 [DI]
17499 / \
17500 [Extr<0>] [Extr<1>]
17501 | |
17502 [DI] [DI]
17503 / \ / \
17504 [Extr<0>][Extr<1>] [Extr<0>][Extr<1>]
17505 | | | |
17506roots: A C B D
17507roots in correct order of DI4 will be: A B C D.
17508Returns true if `DI` is the top of an IR tree that represents a theoretical
17509vector.deinterleave4 intrinsic. When true is returned, \p `DeinterleavedValues`
17510vector is populated with the results such an intrinsic would return: (i.e. {A,
17511B, C, D } = vector.deinterleave4(...))
17512*/
17513static bool getDeinterleave4Values(
17514 Value *DI, SmallVectorImpl<Instruction *> &DeinterleavedValues,
17515 SmallVectorImpl<Instruction *> &DeInterleaveDeadInsts) {
17516 if (!DI->hasNUses(2))
17517 return false;
17518 auto *Extr1 = dyn_cast<ExtractValueInst>(*(DI->user_begin()));
17519 auto *Extr2 = dyn_cast<ExtractValueInst>(*(++DI->user_begin()));
17520 if (!Extr1 || !Extr2)
17521 return false;
17522
17523 if (!Extr1->hasOneUse() || !Extr2->hasOneUse())
17524 return false;
17525 auto *DI1 = *(Extr1->user_begin());
17526 auto *DI2 = *(Extr2->user_begin());
17527
17528 if (!DI1->hasNUses(2) || !DI2->hasNUses(2))
17529 return false;
17530 // Leaf nodes of the deinterleave tree:
17531 auto *A = dyn_cast<ExtractValueInst>(*(DI1->user_begin()));
17532 auto *C = dyn_cast<ExtractValueInst>(*(++DI1->user_begin()));
17533 auto *B = dyn_cast<ExtractValueInst>(*(DI2->user_begin()));
17534 auto *D = dyn_cast<ExtractValueInst>(*(++DI2->user_begin()));
17535 // Make sure that the A,B,C and D are ExtractValue instructions before getting
17536 // the extract index
17537 if (!A || !B || !C || !D)
17538 return false;
17539
17540 DeinterleavedValues.resize(4);
17541 // Place the values into the vector in the order of deinterleave4:
17542 DeinterleavedValues[0x3 &
17543 ((A->getIndices()[0] * 2) + Extr1->getIndices()[0])] = A;
17544 DeinterleavedValues[0x3 &
17545 ((B->getIndices()[0] * 2) + Extr2->getIndices()[0])] = B;
17546 DeinterleavedValues[0x3 &
17547 ((C->getIndices()[0] * 2) + Extr1->getIndices()[0])] = C;
17548 DeinterleavedValues[0x3 &
17549 ((D->getIndices()[0] * 2) + Extr2->getIndices()[0])] = D;
17550 if (!DeinterleavedValues[0] || !DeinterleavedValues[1] ||
17551 !DeinterleavedValues[2] || !DeinterleavedValues[3])
17552 return false;
17553
17554 // Make sure that A,B,C,D match the deinterleave tree pattern
17555 if (!match(DeinterleavedValues[0], m_ExtractValue<0>(m_Deinterleave2(
17556 m_ExtractValue<0>(m_Specific(DI))))) ||
17557 !match(DeinterleavedValues[1], m_ExtractValue<0>(m_Deinterleave2(
17558 m_ExtractValue<1>(m_Specific(DI))))) ||
17559 !match(DeinterleavedValues[2], m_ExtractValue<1>(m_Deinterleave2(
17560 m_ExtractValue<0>(m_Specific(DI))))) ||
17561 !match(DeinterleavedValues[3], m_ExtractValue<1>(m_Deinterleave2(
17562 m_ExtractValue<1>(m_Specific(DI)))))) {
17563 LLVM_DEBUG(dbgs() << "matching deinterleave4 failed\n");
17564 return false;
17565 }
17566
17567 // These Values will not be used anymore,
17568 // DI4 will be created instead of nested DI1 and DI2
17569 DeInterleaveDeadInsts.insert(DeInterleaveDeadInsts.end(),
17570 DeinterleavedValues.begin(),
17571 DeinterleavedValues.end());
17572 DeInterleaveDeadInsts.push_back(cast<Instruction>(DI1));
17573 DeInterleaveDeadInsts.push_back(cast<Instruction>(Extr1));
17574 DeInterleaveDeadInsts.push_back(cast<Instruction>(DI2));
17575 DeInterleaveDeadInsts.push_back(cast<Instruction>(Extr2));
17576
17577 return true;
17578}
17579
17580static bool getDeinterleavedValues(
17581 Value *DI, SmallVectorImpl<Instruction *> &DeinterleavedValues,
17582 SmallVectorImpl<Instruction *> &DeInterleaveDeadInsts) {
17583 if (getDeinterleave4Values(DI, DeinterleavedValues, DeInterleaveDeadInsts))
17584 return true;
17585 return getDeinterleave2Values(DI, DeinterleavedValues, DeInterleaveDeadInsts);
17586}
17587
17588bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
17589 IntrinsicInst *DI, LoadInst *LI,
17590 SmallVectorImpl<Instruction *> &DeadInsts) const {
17591 // Only deinterleave2 supported at present.
17592 if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
17593 return false;
17594
17595 SmallVector<Instruction *, 4> DeinterleavedValues;
17596 SmallVector<Instruction *, 8> DeInterleaveDeadInsts;
17597
17598 if (!getDeinterleavedValues(DI, DeinterleavedValues, DeInterleaveDeadInsts)) {
17599 LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
17600 return false;
17601 }
17602 unsigned Factor = DeinterleavedValues.size();
17603 assert((Factor == 2 || Factor == 4) &&
17604 "Currently supported Factor is 2 or 4 only");
17605 VectorType *VTy = cast<VectorType>(DeinterleavedValues[0]->getType());
17606
17607 const DataLayout &DL = DI->getModule()->getDataLayout();
17608 bool UseScalable;
17609 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
17610 return false;
17611
17612 // TODO: Add support for using SVE instructions with fixed types later, using
17613 // the code from lowerInterleavedLoad to obtain the correct container type.
17614 if (UseScalable && !VTy->isScalableTy())
17615 return false;
17616
17617 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
17618 VectorType *LdTy =
17619 VectorType::get(VTy->getElementType(),
17620 VTy->getElementCount().divideCoefficientBy(NumLoads));
17621
17622 Type *PtrTy = LI->getPointerOperandType();
17623 Function *LdNFunc = getStructuredLoadFunction(DI->getModule(), Factor,
17624 UseScalable, LdTy, PtrTy);
17625
17626 IRBuilder<> Builder(LI);
17627 Value *Pred = nullptr;
17628 if (UseScalable)
17629 Pred =
17630 Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
17631
17632 Value *BaseAddr = LI->getPointerOperand();
17633 if (NumLoads > 1) {
17634 // Create multiple legal small ldN.
17635 SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy));
17636 for (unsigned I = 0; I < NumLoads; ++I) {
17637 Value *Offset = Builder.getInt64(I * Factor);
17638
17639 Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
17640 Value *LdN = nullptr;
17641 if (UseScalable)
17642 LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
17643 else
17644 LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
17645 Value *Idx =
17646 Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
17647 for (unsigned J = 0; J < Factor; ++J) {
17648 ExtractedLdValues[J] = Builder.CreateInsertVector(
17649 VTy, ExtractedLdValues[J], Builder.CreateExtractValue(LdN, J), Idx);
17650 }
17651 LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
17652 }
17653 // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
17654 for (unsigned J = 0; J < Factor; ++J)
17655 DeinterleavedValues[J]->replaceAllUsesWith(ExtractedLdValues[J]);
17656 } else {
17657 Value *Result;
17658 if (UseScalable)
17659 Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
17660 else
17661 Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
17662 // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
17663 for (unsigned I = 0; I < Factor; I++) {
17664 Value *NewExtract = Builder.CreateExtractValue(Result, I);
17665 DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
17666 }
17667 }
17668 DeadInsts.insert(DeadInsts.end(), DeInterleaveDeadInsts.begin(),
17669 DeInterleaveDeadInsts.end());
17670 return true;
17671}
17672
17673/*
17674InterleaveIntrinsic tree.
17675 A C B D
17676 \ / \ /
17677 [II] [II]
17678 \ /
17679 [II]
17680
17681values in correct order of interleave4: A B C D.
17682Returns true if `II` is the root of an IR tree that represents a theoretical
17683vector.interleave4 intrinsic. When true is returned, \p `InterleavedValues`
17684vector is populated with the inputs such an intrinsic would take: (i.e.
17685vector.interleave4(A, B, C, D)).
17686*/
17687static bool getValuesToInterleave(
17688 Value *II, SmallVectorImpl<Value *> &InterleavedValues,
17689 SmallVectorImpl<Instruction *> &InterleaveDeadInsts) {
17690 Value *A, *B, *C, *D;
17691 // Try to match interleave of Factor 4
17694 InterleavedValues.push_back(A);
17695 InterleavedValues.push_back(B);
17696 InterleavedValues.push_back(C);
17697 InterleavedValues.push_back(D);
17698 // intermediate II will not be needed anymore
17699 InterleaveDeadInsts.push_back(
17700 cast<Instruction>(cast<Instruction>(II)->getOperand(0)));
17701 InterleaveDeadInsts.push_back(
17702 cast<Instruction>(cast<Instruction>(II)->getOperand(1)));
17703 return true;
17704 }
17705
17706 // Try to match interleave of Factor 2
17707 if (match(II, m_Interleave2(m_Value(A), m_Value(B)))) {
17708 InterleavedValues.push_back(A);
17709 InterleavedValues.push_back(B);
17710 return true;
17711 }
17712
17713 return false;
17714}
17715
17716bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
17717 IntrinsicInst *II, StoreInst *SI,
17718 SmallVectorImpl<Instruction *> &DeadInsts) const {
17719 // Only interleave2 supported at present.
17720 if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
17721 return false;
17722
17723 SmallVector<Value *, 4> InterleavedValues;
17724 SmallVector<Instruction *, 2> InterleaveDeadInsts;
17725 if (!getValuesToInterleave(II, InterleavedValues, InterleaveDeadInsts)) {
17726 LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
17727 return false;
17728 }
17729 unsigned Factor = InterleavedValues.size();
17730 assert((Factor == 2 || Factor == 4) &&
17731 "Currently supported Factor is 2 or 4 only");
17732 VectorType *VTy = cast<VectorType>(InterleavedValues[0]->getType());
17733 const DataLayout &DL = II->getModule()->getDataLayout();
17734
17735 bool UseScalable;
17736 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
17737 return false;
17738
17739 // TODO: Add support for using SVE instructions with fixed types later, using
17740 // the code from lowerInterleavedStore to obtain the correct container type.
17741 if (UseScalable && !VTy->isScalableTy())
17742 return false;
17743
17744 unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
17745
17746 VectorType *StTy =
17747 VectorType::get(VTy->getElementType(),
17748 VTy->getElementCount().divideCoefficientBy(NumStores));
17749
17750 Type *PtrTy = SI->getPointerOperandType();
17751 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
17752 UseScalable, StTy, PtrTy);
17753
17754 IRBuilder<> Builder(SI);
17755
17756 Value *BaseAddr = SI->getPointerOperand();
17757 Value *Pred = nullptr;
17758
17759 if (UseScalable)
17760 Pred =
17761 Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
17762
17763 auto ExtractedValues = InterleavedValues;
17764 if (UseScalable)
17765 InterleavedValues.push_back(Pred);
17766 InterleavedValues.push_back(BaseAddr);
17767 for (unsigned I = 0; I < NumStores; ++I) {
17768 Value *Address = BaseAddr;
17769 if (NumStores > 1) {
17770 Value *Offset = Builder.getInt64(I * Factor);
17771 Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
17772 Value *Idx =
17773 Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
17774 for (unsigned J = 0; J < Factor; J++) {
17775 InterleavedValues[J] =
17776 Builder.CreateExtractVector(StTy, ExtractedValues[J], Idx);
17777 }
17778 // update the address
17779 InterleavedValues[InterleavedValues.size() - 1] = Address;
17780 }
17781 Builder.CreateCall(StNFunc, InterleavedValues);
17782 }
17783 DeadInsts.insert(DeadInsts.end(), InterleaveDeadInsts.begin(),
17784 InterleaveDeadInsts.end());
17785 return true;
17786}
17787
17788EVT AArch64TargetLowering::getOptimalMemOpType(
17789 const MemOp &Op, const AttributeList &FuncAttributes) const {
17790 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
17791 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
17792 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
17793 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
17794 // taken one instruction to materialize the v2i64 zero and one store (with
17795 // restrictive addressing mode). Just do i64 stores.
17796 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
17797 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
17798 if (Op.isAligned(AlignCheck))
17799 return true;
17800 unsigned Fast;
17801 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
17802 MachineMemOperand::MONone, &Fast) &&
17803 Fast;
17804 };
17805
17806 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
17807 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
17808 return MVT::v16i8;
17809 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
17810 return MVT::f128;
17811 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
17812 return MVT::i64;
17813 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
17814 return MVT::i32;
17815 return MVT::Other;
17816}
17817
17818LLT AArch64TargetLowering::getOptimalMemOpLLT(
17819 const MemOp &Op, const AttributeList &FuncAttributes) const {
17820 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
17821 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
17822 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
17823 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
17824 // taken one instruction to materialize the v2i64 zero and one store (with
17825 // restrictive addressing mode). Just do i64 stores.
17826 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
17827 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
17828 if (Op.isAligned(AlignCheck))
17829 return true;
17830 unsigned Fast;
17831 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
17832 MachineMemOperand::MONone, &Fast) &&
17833 Fast;
17834 };
17835
17836 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
17837 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
17838 return LLT::fixed_vector(2, 64);
17839 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
17840 return LLT::scalar(128);
17841 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
17842 return LLT::scalar(64);
17843 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
17844 return LLT::scalar(32);
17845 return LLT();
17846}
17847
17848// 12-bit optionally shifted immediates are legal for adds.
17849bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
17850 if (Immed == std::numeric_limits<int64_t>::min()) {
17851 LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
17852 << ": avoid UB for INT64_MIN\n");
17853 return false;
17854 }
17855 // Same encoding for add/sub, just flip the sign.
17856 Immed = std::abs(Immed);
17857 bool IsLegal = ((Immed >> 12) == 0 ||
17858 ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
17859 LLVM_DEBUG(dbgs() << "Is " << Immed
17860 << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
17861 return IsLegal;
17862}
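// Examples: 0, 4095 (0xfff) and 0x123000 (a 12-bit value shifted left by 12)
// are legal add immediates; 0x123456 is not, and neither is INT64_MIN.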
17863
17864bool AArch64TargetLowering::isLegalAddScalableImmediate(int64_t Imm) const {
17865 // We will only emit addvl/inc* instructions for SVE2
17866 if (!Subtarget->hasSVE2())
17867 return false;
17868
17869 // addvl's immediates are in terms of the number of bytes in a register.
17870 // Since there are 16 in the base supported size (128bits), we need to
17871 // divide the immediate by that much to give us a useful immediate to
17872 // multiply by vscale. We can't have a remainder as a result of this.
17873 if (Imm % 16 == 0)
17874 return isInt<6>(Imm / 16);
17875
17876 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
17877 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
17878 // of addvl as a result, so only take h|w|d into account.
17879 // Dec[h|w|d] will cover subtractions.
17880 // Immediates are in the range [1,16], so we can't do a 2's complement check.
17881 // FIXME: Can we make use of other patterns to cover other immediates?
17882
17883 // inch|dech
17884 if (Imm % 8 == 0)
17885 return std::abs(Imm / 8) <= 16;
17886 // incw|decw
17887 if (Imm % 4 == 0)
17888 return std::abs(Imm / 4) <= 16;
17889 // incd|decd
17890 if (Imm % 2 == 0)
17891 return std::abs(Imm / 2) <= 16;
17892
17893 return false;
17894}
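// Examples: Imm = 32 maps to addvl #2 (32/16 fits in a signed 6-bit field);
// Imm = 24 maps to inch with multiplier 3; Imm = -12 maps to decw with
// multiplier 3; Imm = 17 is not representable, so false is returned.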
17895
17896// Return false to prevent folding
17897// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
17898// if the folding leads to worse code.
17899bool AArch64TargetLowering::isMulAddWithConstProfitable(
17900 SDValue AddNode, SDValue ConstNode) const {
17901 // Let the DAGCombiner decide for vector types and large types.
17902 const EVT VT = AddNode.getValueType();
17903 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
17904 return true;
17905
17906 // It is worse if c1 is legal add immediate, while c1*c2 is not
17907 // and has to be composed by at least two instructions.
17908 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
17909 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
17910 const int64_t C1 = C1Node->getSExtValue();
17911 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
17912 if (!isLegalAddImmediate(C1) || isLegalAddImmediate(C1C2.getSExtValue()))
17913 return true;
17914 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
17915 // Adapt to the width of a register.
17916 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
17917 AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn);
17918 if (Insn.size() > 1)
17919 return false;
17920
17921 // Default to true and let the DAGCombiner decide.
17922 return true;
17923}
17924
17925// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
17926// immediates is the same as for an add or a sub.
17927bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
17928 return isLegalAddImmediate(Immed);
17929}
17930
17931/// isLegalAddressingMode - Return true if the addressing mode represented
17932/// by AM is legal for this target, for a load/store of the specified type.
17933bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
17934 const AddrMode &AMode, Type *Ty,
17935 unsigned AS, Instruction *I) const {
17936 // AArch64 has five basic addressing modes:
17937 // reg
17938 // reg + 9-bit signed offset
17939 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
17940 // reg1 + reg2
17941 // reg + SIZE_IN_BYTES * reg
17942
17943 // No global is ever allowed as a base.
17944 if (AMode.BaseGV)
17945 return false;
17946
17947 // No reg+reg+imm addressing.
17948 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
17949 return false;
17950
17951 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
17952 // `2*ScaledReg` into `BaseReg + ScaledReg`
17953 AddrMode AM = AMode;
17954 if (AM.Scale && !AM.HasBaseReg) {
17955 if (AM.Scale == 1) {
17956 AM.HasBaseReg = true;
17957 AM.Scale = 0;
17958 } else if (AM.Scale == 2) {
17959 AM.HasBaseReg = true;
17960 AM.Scale = 1;
17961 } else {
17962 return false;
17963 }
17964 }
17965
17966 // A base register is required in all addressing modes.
17967 if (!AM.HasBaseReg)
17968 return false;
17969
17970 if (Ty->isScalableTy()) {
17971 if (isa<ScalableVectorType>(Ty)) {
17972 // See if we have a foldable vscale-based offset, for vector types which
17973 // are either legal or smaller than the minimum; more work will be
17974 // required if we need to consider addressing for types which need
17975 // legalization by splitting.
17976 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
17977 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
17978 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
17979 isPowerOf2_64(VecNumBytes))
17980 return isInt<4>(AM.ScalableOffset / (int64_t)VecNumBytes);
17981
17982 uint64_t VecElemNumBytes =
17983 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
17984 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
17985 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
17986 }
17987
17988 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
17989 }
17990
17991 // No scalable offsets allowed for non-scalable types.
17992 if (AM.ScalableOffset)
17993 return false;
17994
17995 // check reg + imm case:
17996 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
17997 uint64_t NumBytes = 0;
17998 if (Ty->isSized()) {
17999 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
18000 NumBytes = NumBits / 8;
18001 if (!isPowerOf2_64(NumBits))
18002 NumBytes = 0;
18003 }
18004
18005 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
18006 AM.Scale);
18007}
18008
18009// Check whether the two offsets belong to the same imm24 range and have the
18010// same high 12 bits; if so, the high part can be folded into the offset of an add.
18011int64_t
18012AArch64TargetLowering::getPreferredLargeGEPBaseOffset(int64_t MinOffset,
18013 int64_t MaxOffset) const {
18014 int64_t HighPart = MinOffset & ~0xfffULL;
18015 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
18016 // Rebase the value to an integer multiple of imm12.
18017 return HighPart;
18018 }
18019
18020 return 0;
18021}
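// E.g. MinOffset = 0x1234 and MaxOffset = 0x1ff8 share the high part 0x1000
// (both >> 12 == 1), which is itself a legal add immediate, so 0x1000 is
// returned and the accesses can be rebased as an add plus imm12 offsets.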
18022
18023bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
18024 // Consider splitting large offset of struct or array.
18025 return true;
18026}
18027
18028bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
18029 const MachineFunction &MF, EVT VT) const {
18030 VT = VT.getScalarType();
18031
18032 if (!VT.isSimple())
18033 return false;
18034
18035 switch (VT.getSimpleVT().SimpleTy) {
18036 case MVT::f16:
18037 return Subtarget->hasFullFP16();
18038 case MVT::f32:
18039 case MVT::f64:
18040 return true;
18041 default:
18042 break;
18043 }
18044
18045 return false;
18046}
18047
18048bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
18049 Type *Ty) const {
18050 switch (Ty->getScalarType()->getTypeID()) {
18051 case Type::FloatTyID:
18052 case Type::DoubleTyID:
18053 return true;
18054 default:
18055 return false;
18056 }
18057}
18058
18059bool AArch64TargetLowering::generateFMAsInMachineCombiner(
18060 EVT VT, CodeGenOptLevel OptLevel) const {
18061 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
18062 !useSVEForFixedLengthVectorVT(VT);
18063}
18064
18065const MCPhysReg *
18066AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
18067 // LR is a callee-save register, but we must treat it as clobbered by any call
18068 // site. Hence we include LR in the scratch registers, which are in turn added
18069 // as implicit-defs for stackmaps and patchpoints.
18070 static const MCPhysReg ScratchRegs[] = {
18071 AArch64::X16, AArch64::X17, AArch64::LR, 0
18072 };
18073 return ScratchRegs;
18074}
18075
18076ArrayRef<MCPhysReg> AArch64TargetLowering::getRoundingControlRegisters() const {
18077 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
18078 return RCRegs;
18079}
18080
18081bool
18082AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
18083 CombineLevel Level) const {
18084 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
18085 N->getOpcode() == ISD::SRL) &&
18086 "Expected shift op");
18087
18088 SDValue ShiftLHS = N->getOperand(0);
18089 EVT VT = N->getValueType(0);
18090
18091 if (!ShiftLHS->hasOneUse())
18092 return false;
18093
18094 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
18095 !ShiftLHS.getOperand(0)->hasOneUse())
18096 return false;
18097
18098 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
18099 // combine it with shift 'N' to let it be lowered to UBFX except:
18100 // ((x >> C) & mask) << C.
18101 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
18102 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
18103 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
18104 if (isMask_64(TruncMask)) {
18105 SDValue AndLHS = ShiftLHS.getOperand(0);
18106 if (AndLHS.getOpcode() == ISD::SRL) {
18107 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
18108 if (N->getOpcode() == ISD::SHL)
18109 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
18110 return SRLC->getZExtValue() == SHLC->getZExtValue();
18111 return false;
18112 }
18113 }
18114 }
18115 }
18116 return true;
18117}
18118
18119bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
18120 const SDNode *N) const {
18121 assert(N->getOpcode() == ISD::XOR &&
18122 (N->getOperand(0).getOpcode() == ISD::SHL ||
18123 N->getOperand(0).getOpcode() == ISD::SRL) &&
18124 "Expected XOR(SHIFT) pattern");
18125
18126 // Only commute if the entire NOT mask is a hidden shifted mask.
18127 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
18128 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
18129 if (XorC && ShiftC) {
18130 unsigned MaskIdx, MaskLen;
18131 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
18132 unsigned ShiftAmt = ShiftC->getZExtValue();
18133 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
18134 if (N->getOperand(0).getOpcode() == ISD::SHL)
18135 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
18136 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
18137 }
18138 }
18139
18140 return false;
18141}
18142
18143bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
18144 const SDNode *N, CombineLevel Level) const {
18145 assert(((N->getOpcode() == ISD::SHL &&
18146 N->getOperand(0).getOpcode() == ISD::SRL) ||
18147 (N->getOpcode() == ISD::SRL &&
18148 N->getOperand(0).getOpcode() == ISD::SHL)) &&
18149 "Expected shift-shift mask");
18150 // Don't allow multiuse shift folding with the same shift amount.
18151 if (!N->getOperand(0)->hasOneUse())
18152 return false;
18153
18154 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
18155 EVT VT = N->getValueType(0);
18156 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
18157 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
18158 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
18159 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
18160 }
18161
18162 // We do not need to fold when this shift is used in the specific load case:
18163 // (ldr x, (add x, (shl (srl x, c1) 2)))
18164 if (N->getOpcode() == ISD::SHL && N->hasOneUse()) {
18165 if (auto C2 = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
18166 unsigned ShlAmt = C2->getZExtValue();
18167 if (auto ShouldADD = *N->user_begin();
18168 ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) {
18169 if (auto ShouldLOAD = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) {
18170 unsigned ByteVT = ShouldLOAD->getMemoryVT().getSizeInBits() / 8;
18171 if ((1ULL << ShlAmt) == ByteVT &&
18172 isIndexedLoadLegal(ISD::PRE_INC, ShouldLOAD->getMemoryVT()))
18173 return false;
18174 }
18175 }
18176 }
18177 }
18178
18179 return true;
18180}
18181
18182bool AArch64TargetLowering::shouldFoldSelectWithIdentityConstant(
18183 unsigned BinOpcode, EVT VT) const {
18184 return VT.isScalableVector() && isTypeLegal(VT);
18185}
18186
18187bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
18188 Type *Ty) const {
18189 assert(Ty->isIntegerTy());
18190
18191 unsigned BitSize = Ty->getPrimitiveSizeInBits();
18192 if (BitSize == 0)
18193 return false;
18194
18195 int64_t Val = Imm.getSExtValue();
18196 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
18197 return true;
18198
18199 if ((int64_t)Val < 0)
18200 Val = ~Val;
18201 if (BitSize == 32)
18202 Val &= (1LL << 32) - 1;
18203
18204 unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
18205 // MOVZ is free so return true for one or fewer MOVK.
18206 return Shift < 3;
18207}
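// Worked illustration of the check above: Imm = 0x12345678 is not a logical
// immediate and its highest set bit is bit 28, so Shift = 28 / 16 = 1 and the
// function returns true (MOVZ plus one MOVK). Values whose highest set bit
// (after the conditional bitwise-NOT) falls in the top 16-bit chunk give
// Shift == 3 and are rejected.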
18208
18209 bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
18210 unsigned Index) const {
18211 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
18212 return false;
18213
18214 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
18215}
18216
18217/// Turn vector tests of the signbit in the form of:
18218/// xor (sra X, elt_size(X)-1), -1
18219/// into:
18220/// cmge X, X, #0
18221 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
18222 const AArch64Subtarget *Subtarget) {
18223 EVT VT = N->getValueType(0);
18224 if (!Subtarget->hasNEON() || !VT.isVector())
18225 return SDValue();
18226
18227 // There must be a shift right algebraic before the xor, and the xor must be a
18228 // 'not' operation.
18229 SDValue Shift = N->getOperand(0);
18230 SDValue Ones = N->getOperand(1);
18231 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
18232 !ISD::isConstantSplatVectorAllOnes(Ones.getNode()))
18233 return SDValue();
18234
18235 // The shift should be smearing the sign bit across each vector element.
18236 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
18237 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
18238 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
18239 return SDValue();
18240
18241 return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
18242}
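// Illustration of the fold above: for v4i32 X, xor(VASHR(X, 31), splat(-1))
// computes ~(X >> 31) per lane, which is all-ones exactly when the lane is
// non-negative; that is what CMGEz (compare greater-than-or-equal to zero)
// produces directly, so the shift and the NOT collapse into one node.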
18243
18244// Given a vecreduce_add node, detect the below pattern and convert it to the
18245// node sequence with UABDL, [S|U]ADB and UADDLP.
18246//
18247// i32 vecreduce_add(
18248// v16i32 abs(
18249// v16i32 sub(
18250// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
18251// =================>
18252// i32 vecreduce_add(
18253// v4i32 UADDLP(
18254// v8i16 add(
18255// v8i16 zext(
18256// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
18257// v8i16 zext(
18258// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
18259 static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
18260 SelectionDAG &DAG) {
18261 // Assumed i32 vecreduce_add
18262 if (N->getValueType(0) != MVT::i32)
18263 return SDValue();
18264
18265 SDValue VecReduceOp0 = N->getOperand(0);
18266 unsigned Opcode = VecReduceOp0.getOpcode();
18267 // Assumed v16i32 abs
18268 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
18269 return SDValue();
18270
18271 SDValue ABS = VecReduceOp0;
18272 // Assumed v16i32 sub
18273 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
18274 ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
18275 return SDValue();
18276
18277 SDValue SUB = ABS->getOperand(0);
18278 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
18279 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
18280 // Assumed v16i32 type
18281 if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
18282 SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
18283 return SDValue();
18284
18285 // Assumed zext or sext
18286 bool IsZExt = false;
18287 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
18288 IsZExt = true;
18289 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
18290 IsZExt = false;
18291 } else
18292 return SDValue();
18293
18294 SDValue EXT0 = SUB->getOperand(0);
18295 SDValue EXT1 = SUB->getOperand(1);
18296 // Assumed zext's operand has v16i8 type
18297 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
18298 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
18299 return SDValue();
18300
18301 // Pattern is detected. Let's convert it to a sequence of nodes.
18302 SDLoc DL(N);
18303
18304 // First, create the node pattern of UABD/SABD.
18305 SDValue UABDHigh8Op0 =
18306 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
18307 DAG.getConstant(8, DL, MVT::i64));
18308 SDValue UABDHigh8Op1 =
18309 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
18310 DAG.getConstant(8, DL, MVT::i64));
18311 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
18312 UABDHigh8Op0, UABDHigh8Op1);
18313 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
18314
18315 // Second, create the node pattern of UABAL.
18316 SDValue UABDLo8Op0 =
18317 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
18318 DAG.getConstant(0, DL, MVT::i64));
18319 SDValue UABDLo8Op1 =
18320 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
18321 DAG.getConstant(0, DL, MVT::i64));
18322 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
18323 UABDLo8Op0, UABDLo8Op1);
18324 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
18325 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
18326
18327 // Third, create the node of UADDLP.
18328 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
18329
18330 // Fourth, create the node of VECREDUCE_ADD.
18331 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
18332}
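// In effect this recognises a sum-of-absolute-differences reduction: rather
// than widening both v16i8 inputs to v16i32 before subtracting, it takes the
// byte-wise absolute differences of the low and high halves ([S|U]ABD), widens
// them only to v8i16, pairwise-accumulates into v4i32 (UADDLP), and reduces
// that, keeping every intermediate value within a 128-bit vector.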
18333
18334// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
18335// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
18336// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
18337// If we have vectors larger than v16i8 we extract v16i8 vectors,
18338// Follow the same steps above to get DOT instructions concatenate them
18339// and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
18340 static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
18341 const AArch64Subtarget *ST) {
18342 if (!ST->isNeonAvailable())
18343 return SDValue();
18344
18345 if (!ST->hasDotProd())
18346 return performVecReduceAddCombineWithUADDLP(N, DAG);
18347
18348 SDValue Op0 = N->getOperand(0);
18349 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
18350 Op0.getValueType().getVectorElementType() != MVT::i32)
18351 return SDValue();
18352
18353 unsigned ExtOpcode = Op0.getOpcode();
18354 SDValue A = Op0;
18355 SDValue B;
18356 unsigned DotOpcode;
18357 if (ExtOpcode == ISD::MUL) {
18358 A = Op0.getOperand(0);
18359 B = Op0.getOperand(1);
18360 if (A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
18361 return SDValue();
18362 auto OpCodeA = A.getOpcode();
18363 if (OpCodeA != ISD::ZERO_EXTEND && OpCodeA != ISD::SIGN_EXTEND)
18364 return SDValue();
18365
18366 auto OpCodeB = B.getOpcode();
18367 if (OpCodeB != ISD::ZERO_EXTEND && OpCodeB != ISD::SIGN_EXTEND)
18368 return SDValue();
18369
18370 if (OpCodeA == OpCodeB) {
18371 DotOpcode =
18372 OpCodeA == ISD::ZERO_EXTEND ? AArch64ISD::UDOT : AArch64ISD::SDOT;
18373 } else {
18374 // Check USDOT support
18375 if (!ST->hasMatMulInt8())
18376 return SDValue();
18377 DotOpcode = AArch64ISD::USDOT;
18378 if (OpCodeA == ISD::SIGN_EXTEND)
18379 std::swap(A, B);
18380 }
18381 } else if (ExtOpcode == ISD::ZERO_EXTEND) {
18382 DotOpcode = AArch64ISD::UDOT;
18383 } else if (ExtOpcode == ISD::SIGN_EXTEND) {
18384 DotOpcode = AArch64ISD::SDOT;
18385 } else {
18386 return SDValue();
18387 }
18388
18389 EVT Op0VT = A.getOperand(0).getValueType();
18390 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
18391 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
18392 if (!IsValidElementCount || !IsValidSize)
18393 return SDValue();
18394
18395 SDLoc DL(Op0);
18396 // For non-mla reductions B can be set to 1. For MLA we take the operand of
18397 // the extend B.
18398 if (!B)
18399 B = DAG.getConstant(1, DL, Op0VT);
18400 else
18401 B = B.getOperand(0);
18402
18403 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
18404 unsigned NumOfVecReduce;
18405 EVT TargetType;
18406 if (IsMultipleOf16) {
18407 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
18408 TargetType = MVT::v4i32;
18409 } else {
18410 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
18411 TargetType = MVT::v2i32;
18412 }
18413 // Handle the case where we need to generate only one Dot operation.
18414 if (NumOfVecReduce == 1) {
18415 SDValue Zeros = DAG.getConstant(0, DL, TargetType);
18416 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
18417 A.getOperand(0), B);
18418 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
18419 }
18420 // Generate Dot instructions that are multiple of 16.
18421 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
18422 SmallVector<SDValue, 4> SDotVec16;
18423 unsigned I = 0;
18424 for (; I < VecReduce16Num; I += 1) {
18425 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
18426 SDValue Op0 =
18427 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
18428 DAG.getConstant(I * 16, DL, MVT::i64));
18429 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
18430 DAG.getConstant(I * 16, DL, MVT::i64));
18431 SDValue Dot =
18432 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
18433 SDotVec16.push_back(Dot);
18434 }
18435 // Concatenate dot operations.
18436 EVT SDot16EVT =
18437 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
18438 SDValue ConcatSDot16 =
18439 DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
18440 SDValue VecReduceAdd16 =
18441 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
18442 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
18443 if (VecReduce8Num == 0)
18444 return VecReduceAdd16;
18445
18446 // Generate the remainder Dot operation that is multiple of 8.
18447 SmallVector<SDValue, 4> SDotVec8;
18448 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
18449 SDValue Vec8Op0 =
18450 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
18451 DAG.getConstant(I * 16, DL, MVT::i64));
18452 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
18453 DAG.getConstant(I * 16, DL, MVT::i64));
18454 SDValue Dot =
18455 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
18456 SDValue VecReduceAdd8 =
18457 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
18458 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
18459 VecReduceAdd8);
18460}
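// Splitting example for the code above: a v24i8 input gives VecReduce16Num = 1
// and VecReduce8Num = 1, so one v16i8 dot product accumulating into v4i32
// covers elements [0,16), one v8i8 dot product accumulating into v2i32 covers
// elements [16,24), and the two scalar reductions are added at the end.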
18461
18462// Given an (integer) vecreduce, we know the order of the inputs does not
18463// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
18464// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
18465// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
18466 static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {
18467 auto DetectAddExtract = [&](SDValue A) {
18468 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
18469 // UADDLP(x) if found.
18470 assert(A.getOpcode() == ISD::ADD);
18471 EVT VT = A.getValueType();
18472 SDValue Op0 = A.getOperand(0);
18473 SDValue Op1 = A.getOperand(1);
18474 if (Op0.getOpcode() != Op1.getOpcode() ||
18475 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
18476 Op0.getOpcode() != ISD::SIGN_EXTEND))
18477 return SDValue();
18478 SDValue Ext0 = Op0.getOperand(0);
18479 SDValue Ext1 = Op1.getOperand(0);
18480 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
18481 Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
18482 Ext0.getOperand(0) != Ext1.getOperand(0))
18483 return SDValue();
18484 // Check that the type is twice the add types, and the extracts are from
18485 // upper/lower parts of the same source.
18486 if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
18487 VT.getVectorNumElements() * 2)
18488 return SDValue();
18489 if ((Ext0.getConstantOperandVal(1) != 0 ||
18490 Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
18491 (Ext1.getConstantOperandVal(1) != 0 ||
18492 Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
18493 return SDValue();
18494 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
18495 : AArch64ISD::SADDLP;
18496 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
18497 };
18498
18499 if (SDValue R = DetectAddExtract(A))
18500 return R;
18501
18502 if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
18503 if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
18504 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
18505 A.getOperand(1));
18506 if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
18507 if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
18508 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
18509 A.getOperand(0));
18510 return SDValue();
18511}
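// Example of the pattern handled above: with x : v16i8,
// UADDV(add(zext(extract_lo(x)) : v8i16, zext(extract_hi(x)) : v8i16))
// becomes UADDV(UADDLP(x)), since UADDLP already performs the pairwise
// widening add of adjacent byte lanes (SADDLP for the sign-extended form).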
18512
18513// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
18514// UADDLV(concat), where the concat represents the 64-bit zext sources.
18515 static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG) {
18516 // Look for add(zext(64-bit source), zext(64-bit source)), returning
18517 // UADDLV(concat(zext, zext)) if found.
18518 assert(A.getOpcode() == ISD::ADD);
18519 EVT VT = A.getValueType();
18520 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
18521 return SDValue();
18522 SDValue Op0 = A.getOperand(0);
18523 SDValue Op1 = A.getOperand(1);
18524 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
18525 return SDValue();
18526 SDValue Ext0 = Op0.getOperand(0);
18527 SDValue Ext1 = Op1.getOperand(0);
18528 EVT ExtVT0 = Ext0.getValueType();
18529 EVT ExtVT1 = Ext1.getValueType();
18530 // Check zext VTs are the same and 64-bit length.
18531 if (ExtVT0 != ExtVT1 ||
18532 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
18533 return SDValue();
18534 // Get VT for concat of zext sources.
18535 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
18536 SDValue Concat =
18537 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
18538
18539 switch (VT.getSimpleVT().SimpleTy) {
18540 case MVT::v2i64:
18541 case MVT::v4i32:
18542 return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
18543 case MVT::v8i16: {
18544 SDValue Uaddlv =
18545 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
18546 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
18547 }
18548 default:
18549 llvm_unreachable("Unhandled vector type");
18550 }
18551}
18552
18553 static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
18554 SDValue A = N->getOperand(0);
18555 if (A.getOpcode() == ISD::ADD) {
18556 if (SDValue R = performUADDVAddCombine(A, DAG))
18557 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
18558 else if (SDValue R = performUADDVZextCombine(A, DAG))
18559 return R;
18560 }
18561 return SDValue();
18562}
18563
18564 static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
18565 TargetLowering::DAGCombinerInfo &DCI,
18566 const AArch64Subtarget *Subtarget) {
18567 if (DCI.isBeforeLegalizeOps())
18568 return SDValue();
18569
18570 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
18571}
18572
18573SDValue
18574AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
18575 SelectionDAG &DAG,
18576 SmallVectorImpl<SDNode *> &Created) const {
18577 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
18578 if (isIntDivCheap(N->getValueType(0), Attr))
18579 return SDValue(N, 0); // Lower SDIV as SDIV
18580
18581 EVT VT = N->getValueType(0);
18582
18583 // For scalable and fixed types, mark them as cheap so we can handle it much
18584 // later. This allows us to handle larger than legal types.
18585 if (VT.isScalableVector() ||
18586 (VT.isFixedLengthVector() && Subtarget->useSVEForFixedLengthVectors()))
18587 return SDValue(N, 0);
18588
18589 // fold (sdiv X, pow2)
18590 if ((VT != MVT::i32 && VT != MVT::i64) ||
18591 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
18592 return SDValue();
18593
18594 // If the divisor is 2 or -2, the default expansion is better. It will add
18595 // (N->getOperand(0) >> (BitWidth - 1)) to it before shifting right.
18596 if (Divisor == 2 ||
18597 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
18598 return SDValue();
18599
18600 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
18601}
18602
18603SDValue
18604AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
18605 SelectionDAG &DAG,
18606 SmallVectorImpl<SDNode *> &Created) const {
18607 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
18608 if (isIntDivCheap(N->getValueType(0), Attr))
18609 return SDValue(N, 0); // Lower SREM as SREM
18610
18611 EVT VT = N->getValueType(0);
18612
18613 // For scalable and fixed types, mark them as cheap so we can handle it much
18614 // later. This allows us to handle larger than legal types.
18615 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
18616 return SDValue(N, 0);
18617
18618 // fold (srem X, pow2)
18619 if ((VT != MVT::i32 && VT != MVT::i64) ||
18620 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
18621 return SDValue();
18622
18623 unsigned Lg2 = Divisor.countr_zero();
18624 if (Lg2 == 0)
18625 return SDValue();
18626
18627 SDLoc DL(N);
18628 SDValue N0 = N->getOperand(0);
18629 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
18630 SDValue Zero = DAG.getConstant(0, DL, VT);
18631 SDValue CCVal, CSNeg;
18632 if (Lg2 == 1) {
18633 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
18634 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
18635 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
18636
18637 Created.push_back(Cmp.getNode());
18638 Created.push_back(And.getNode());
18639 } else {
18640 SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
18641 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18642
18643 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
18644 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
18645 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
18646 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
18647 Negs.getValue(1));
18648
18649 Created.push_back(Negs.getNode());
18650 Created.push_back(AndPos.getNode());
18651 Created.push_back(AndNeg.getNode());
18652 }
18653
18654 return CSNeg;
18655}
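// Worked example for the Lg2 > 1 path above, srem x, 8: Pow2MinusOne is 7,
// SUBS computes -x and sets the flags, AndPos = x & 7, AndNeg = (-x) & 7, and
// CSNEG selects AndPos when -x is negative (i.e. x > 0) and -AndNeg otherwise;
// e.g. x = -11 yields -(11 & 7) = -3, matching the signed remainder.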
18656
18657static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
18658 switch(getIntrinsicID(S.getNode())) {
18659 default:
18660 break;
18661 case Intrinsic::aarch64_sve_cntb:
18662 return 8;
18663 case Intrinsic::aarch64_sve_cnth:
18664 return 16;
18665 case Intrinsic::aarch64_sve_cntw:
18666 return 32;
18667 case Intrinsic::aarch64_sve_cntd:
18668 return 64;
18669 }
18670 return {};
18671}
18672
18673/// Calculates what the pre-extend type is, based on the extension
18674/// operation node provided by \p Extend.
18675///
18676/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
18677/// pre-extend type is pulled directly from the operand, while other extend
18678/// operations need a bit more inspection to get this information.
18679///
18680/// \param Extend The SDNode from the DAG that represents the extend operation
18681///
18682/// \returns The type representing the \p Extend source type, or \p MVT::Other
18683/// if no valid type can be determined
18684 static EVT calculatePreExtendType(SDValue Extend) {
18685 switch (Extend.getOpcode()) {
18686 case ISD::SIGN_EXTEND:
18687 case ISD::ZERO_EXTEND:
18688 case ISD::ANY_EXTEND:
18689 return Extend.getOperand(0).getValueType();
18690 case ISD::AssertSext:
18691 case ISD::AssertZext:
18692 case ISD::SIGN_EXTEND_INREG: {
18693 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
18694 if (!TypeNode)
18695 return MVT::Other;
18696 return TypeNode->getVT();
18697 }
18698 case ISD::AND: {
18699 ConstantSDNode *Constant =
18700 dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
18701 if (!Constant)
18702 return MVT::Other;
18703
18704 uint32_t Mask = Constant->getZExtValue();
18705
18706 if (Mask == UCHAR_MAX)
18707 return MVT::i8;
18708 else if (Mask == USHRT_MAX)
18709 return MVT::i16;
18710 else if (Mask == UINT_MAX)
18711 return MVT::i32;
18712
18713 return MVT::Other;
18714 }
18715 default:
18716 return MVT::Other;
18717 }
18718}
18719
18720/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
18721/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
18722/// SExt/ZExt rather than the scalar SExt/ZExt
18723 static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
18724 EVT VT = BV.getValueType();
18725 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
18726 BV.getOpcode() != ISD::VECTOR_SHUFFLE)
18727 return SDValue();
18728
18729 // Use the first item in the buildvector/shuffle to get the size of the
18730 // extend, and make sure it looks valid.
18731 SDValue Extend = BV->getOperand(0);
18732 unsigned ExtendOpcode = Extend.getOpcode();
18733 bool IsAnyExt = ExtendOpcode == ISD::ANY_EXTEND;
18734 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
18735 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
18736 ExtendOpcode == ISD::AssertSext;
18737 if (!IsAnyExt && !IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
18738 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
18739 return SDValue();
18740 // Shuffle inputs are vectors, limit to SIGN_EXTEND/ZERO_EXTEND/ANY_EXTEND to
18741 // ensure calculatePreExtendType will work without issue.
18742 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
18743 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
18744 return SDValue();
18745
18746 // Restrict valid pre-extend data type
18747 EVT PreExtendType = calculatePreExtendType(Extend);
18748 if (PreExtendType == MVT::Other ||
18749 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
18750 return SDValue();
18751
18752 // Make sure all other operands are equally extended.
18753 bool SeenZExtOrSExt = !IsAnyExt;
18754 for (SDValue Op : drop_begin(BV->ops())) {
18755 if (Op.isUndef())
18756 continue;
18757
18758 if (calculatePreExtendType(Op) != PreExtendType)
18759 return SDValue();
18760
18761 unsigned Opc = Op.getOpcode();
18762 if (Opc == ISD::ANY_EXTEND)
18763 continue;
18764
18765 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
18766 Opc == ISD::AssertSext;
18767
18768 if (SeenZExtOrSExt && OpcIsSExt != IsSExt)
18769 return SDValue();
18770
18771 IsSExt = OpcIsSExt;
18772 SeenZExtOrSExt = true;
18773 }
18774
18775 SDValue NBV;
18776 SDLoc DL(BV);
18777 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
18778 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
18779 EVT PreExtendLegalType =
18780 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
18781 SmallVector<SDValue, 8> NewOps;
18782 for (SDValue Op : BV->ops())
18783 NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
18784 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
18785 PreExtendLegalType));
18786 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
18787 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
18788 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
18789 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
18790 BV.getOperand(1).isUndef()
18791 ? DAG.getUNDEF(PreExtendVT)
18792 : BV.getOperand(1).getOperand(0),
18793 cast<ShuffleVectorSDNode>(BV)->getMask());
18794 }
18795 unsigned ExtOpc = !SeenZExtOrSExt
18796 ? ISD::ANY_EXTEND
18797 : (IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND);
18798 return DAG.getNode(ExtOpc, DL, VT, NBV);
18799}
18800
18801/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
18802/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
18803 static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
18804 // If the value type isn't a vector, none of the operands are going to be dups
18805 EVT VT = Mul->getValueType(0);
18806 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
18807 return SDValue();
18808
18809 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
18810 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
18811
18812 // Neither operands have been changed, don't make any further changes
18813 if (!Op0 && !Op1)
18814 return SDValue();
18815
18816 SDLoc DL(Mul);
18817 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
18818 Op1 ? Op1 : Mul->getOperand(1));
18819}
18820
18821// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
18822// Same for other types with equivalent constants.
18823 static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
18824 EVT VT = N->getValueType(0);
18825 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
18826 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
18827 return SDValue();
18828 if (N->getOperand(0).getOpcode() != ISD::AND ||
18829 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
18830 return SDValue();
18831
18832 SDValue And = N->getOperand(0);
18833 SDValue Srl = And.getOperand(0);
18834
18835 APInt V1, V2, V3;
18836 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
18837 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
18838 !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3))
18839 return SDValue();
18840
18841 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
18842 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
18843 V3 != (HalfSize - 1))
18844 return SDValue();
18845
18846 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
18847 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
18848 VT.getVectorElementCount() * 2);
18849
18850 SDLoc DL(N);
18851 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
18852 SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In);
18853 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
18854}
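// Why the fold above holds for v4i32 (sketch): after srl(X, 15), bit 15 of
// each i16 half lands in bit 0 and bit 31 lands in bit 16; the AND with
// 0x10001 keeps just those two sign bits, and the multiply by 0xffff smears
// each of them across its own 16-bit half. Per i16 lane the result is 0xFFFF
// exactly when that lane is negative, which is what CMLTz computes on the
// NVCAST-ed input.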
18855
18856// Transform vector add(zext i8 to i32, zext i8 to i32)
18857// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
18858// This allows extra uses of saddl/uaddl at the lower vector widths, and less
18859// extends.
18860 static SDValue performVectorExtCombine(SDNode *N, SelectionDAG &DAG) {
18861 EVT VT = N->getValueType(0);
18862 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
18863 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
18864 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
18865 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
18866 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
18867 N->getOperand(0).getOperand(0).getValueType() !=
18868 N->getOperand(1).getOperand(0).getValueType())
18869 return SDValue();
18870
18871 if (N->getOpcode() == ISD::MUL &&
18872 N->getOperand(0).getOpcode() != N->getOperand(1).getOpcode())
18873 return SDValue();
18874
18875 SDValue N0 = N->getOperand(0).getOperand(0);
18876 SDValue N1 = N->getOperand(1).getOperand(0);
18877 EVT InVT = N0.getValueType();
18878
18879 EVT S1 = InVT.getScalarType();
18880 EVT S2 = VT.getScalarType();
18881 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
18882 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
18883 SDLoc DL(N);
18884 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
18885 S2 == MVT::i32 ? MVT::i16 : MVT::i32,
18886 VT.getVectorElementCount());
18887 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
18888 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
18889 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
18890 return DAG.getNode(N->getOpcode() == ISD::MUL ? N->getOperand(0).getOpcode()
18891 : (unsigned)ISD::SIGN_EXTEND,
18892 DL, VT, NewOp);
18893 }
18894 return SDValue();
18895}
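// For instance, v16i32 add(zext(v16i8 a), zext(v16i8 b)) is rewritten as
// sext(v16i16 add(zext(a), zext(b))): each 9-bit sum fits in i16 with the top
// bit clear, so the final sign_extend is equivalent to a zero_extend, and the
// narrower add can use the uaddl/saddl forms mentioned above.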
18896
18897 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
18898 TargetLowering::DAGCombinerInfo &DCI,
18899 const AArch64Subtarget *Subtarget) {
18900
18901 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
18902 return Ext;
18903 if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
18904 return Ext;
18905 if (SDValue Ext = performVectorExtCombine(N, DAG))
18906 return Ext;
18907
18908 if (DCI.isBeforeLegalizeOps())
18909 return SDValue();
18910
18911 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
18912 // and in MachineCombiner pass, add+mul will be combined into madd.
18913 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
18914 SDLoc DL(N);
18915 EVT VT = N->getValueType(0);
18916 SDValue N0 = N->getOperand(0);
18917 SDValue N1 = N->getOperand(1);
18918 SDValue MulOper;
18919 unsigned AddSubOpc;
18920
18921 auto IsAddSubWith1 = [&](SDValue V) -> bool {
18922 AddSubOpc = V->getOpcode();
18923 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
18924 SDValue Opnd = V->getOperand(1);
18925 MulOper = V->getOperand(0);
18926 if (AddSubOpc == ISD::SUB)
18927 std::swap(Opnd, MulOper);
18928 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
18929 return C->isOne();
18930 }
18931 return false;
18932 };
18933
18934 if (IsAddSubWith1(N0)) {
18935 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
18936 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
18937 }
18938
18939 if (IsAddSubWith1(N1)) {
18940 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
18941 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
18942 }
18943
18944 // The below optimizations require a constant RHS.
18945 if (!isa<ConstantSDNode>(N1))
18946 return SDValue();
18947
18948 ConstantSDNode *C = cast<ConstantSDNode>(N1);
18949 const APInt &ConstValue = C->getAPIntValue();
18950
18951 // Allow the scaling to be folded into the `cnt` instruction by preventing
18952 // the scaling to be obscured here. This makes it easier to pattern match.
18953 if (IsSVECntIntrinsic(N0) ||
18954 (N0->getOpcode() == ISD::TRUNCATE &&
18955 (IsSVECntIntrinsic(N0->getOperand(0)))))
18956 if (ConstValue.sge(1) && ConstValue.sle(16))
18957 return SDValue();
18958
18959 // Multiplication of a power of two plus/minus one can be done more
18960 // cheaply as shift+add/sub. For now, this is true unilaterally. If
18961 // future CPUs have a cheaper MADD instruction, this may need to be
18962 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
18963 // 64-bit is 5 cycles, so this is always a win.
18964 // More aggressively, some multiplications N0 * C can be lowered to
18965 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
18966 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
18967 // TODO: lower more cases.
18968
18969 // TrailingZeroes is used to test if the mul can be lowered to
18970 // shift+add+shift.
18971 unsigned TrailingZeroes = ConstValue.countr_zero();
18972 if (TrailingZeroes) {
18973 // Conservatively do not lower to shift+add+shift if the mul might be
18974 // folded into smul or umul.
18975 if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
18976 isZeroExtended(N0, DAG)))
18977 return SDValue();
18978 // Conservatively do not lower to shift+add+shift if the mul might be
18979 // folded into madd or msub.
18980 if (N->hasOneUse() && (N->user_begin()->getOpcode() == ISD::ADD ||
18981 N->user_begin()->getOpcode() == ISD::SUB))
18982 return SDValue();
18983 }
18984 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
18985 // and shift+add+shift.
18986 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
18987 unsigned ShiftAmt;
18988
18989 auto Shl = [&](SDValue N0, unsigned N1) {
18990 if (!N0.getNode())
18991 return SDValue();
18992 // If shift causes overflow, ignore this combine.
18993 if (N1 >= N0.getValueSizeInBits())
18994 return SDValue();
18995 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
18996 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
18997 };
18998 auto Add = [&](SDValue N0, SDValue N1) {
18999 if (!N0.getNode() || !N1.getNode())
19000 return SDValue();
19001 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
19002 };
19003 auto Sub = [&](SDValue N0, SDValue N1) {
19004 if (!N0.getNode() || !N1.getNode())
19005 return SDValue();
19006 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
19007 };
19008 auto Negate = [&](SDValue N) {
19009 if (!N0.getNode())
19010 return SDValue();
19011 SDValue Zero = DAG.getConstant(0, DL, VT);
19012 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
19013 };
19014
19015 // Can the const C be decomposed into (1+2^M)*(1+2^N), eg:
19016 // C = 45 is equal to (1+4)*(1+8); we don't decompose it into (1+2)*(16-1) as
19017 // the (2^N - 1) can't be executed via a single instruction.
19018 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
19019 unsigned BitWidth = C.getBitWidth();
19020 for (unsigned i = 1; i < BitWidth / 2; i++) {
19021 APInt Rem;
19022 APInt X(BitWidth, (1 << i) + 1);
19023 APInt::sdivrem(C, X, N, Rem);
19024 APInt NVMinus1 = N - 1;
19025 if (Rem == 0 && NVMinus1.isPowerOf2()) {
19026 M = X;
19027 return true;
19028 }
19029 }
19030 return false;
19031 };
19032
19033 // Can the const C be decomposed into ((2^M + 1) * 2^N + 1), eg:
19034 // C = 11 is equal to (1+4)*2+1; we don't decompose it into (1+2)*4-1 as
19035 // the (2^N - 1) can't be executed via a single instruction.
19036 auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
19037 APInt CVMinus1 = C - 1;
19038 if (CVMinus1.isNegative())
19039 return false;
19040 unsigned TrailingZeroes = CVMinus1.countr_zero();
19041 APInt SCVMinus1 = CVMinus1.ashr(TrailingZeroes) - 1;
19042 if (SCVMinus1.isPowerOf2()) {
19043 unsigned BitWidth = SCVMinus1.getBitWidth();
19044 M = APInt(BitWidth, SCVMinus1.logBase2());
19045 N = APInt(BitWidth, TrailingZeroes);
19046 return true;
19047 }
19048 return false;
19049 };
19050
19051 // Can the const C be decomposed into (1 - (1 - 2^M) * 2^N), eg:
19052 // C = 29 is equal to 1 - (1 - 2^3) * 2^2.
19053 auto isPowMinusMinusOneConst = [](APInt C, APInt &M, APInt &N) {
19054 APInt CVMinus1 = C - 1;
19055 if (CVMinus1.isNegative())
19056 return false;
19057 unsigned TrailingZeroes = CVMinus1.countr_zero();
19058 APInt CVPlus1 = CVMinus1.ashr(TrailingZeroes) + 1;
19059 if (CVPlus1.isPowerOf2()) {
19060 unsigned BitWidth = CVPlus1.getBitWidth();
19061 M = APInt(BitWidth, CVPlus1.logBase2());
19062 N = APInt(BitWidth, TrailingZeroes);
19063 return true;
19064 }
19065 return false;
19066 };
19067
19068 if (ConstValue.isNonNegative()) {
19069 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
19070 // (mul x, 2^N - 1) => (sub (shl x, N), x)
19071 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
19072 // (mul x, (2^M + 1) * (2^N + 1))
19073 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
19074 // (mul x, (2^M + 1) * 2^N + 1)
19075 // => MV = (add (shl x, M), x); (add (shl MV, N), x)
19076 // (mul x, 1 - (1 - 2^M) * 2^N)
19077 // => MV = (sub x, (shl x, M)); (sub x, (shl MV, N))
19078 APInt SCVMinus1 = ShiftedConstValue - 1;
19079 APInt SCVPlus1 = ShiftedConstValue + 1;
19080 APInt CVPlus1 = ConstValue + 1;
19081 APInt CVM, CVN;
19082 if (SCVMinus1.isPowerOf2()) {
19083 ShiftAmt = SCVMinus1.logBase2();
19084 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
19085 } else if (CVPlus1.isPowerOf2()) {
19086 ShiftAmt = CVPlus1.logBase2();
19087 return Sub(Shl(N0, ShiftAmt), N0);
19088 } else if (SCVPlus1.isPowerOf2()) {
19089 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
19090 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
19091 }
19092 if (Subtarget->hasALULSLFast() &&
19093 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
19094 APInt CVMMinus1 = CVM - 1;
19095 APInt CVNMinus1 = CVN - 1;
19096 unsigned ShiftM1 = CVMMinus1.logBase2();
19097 unsigned ShiftN1 = CVNMinus1.logBase2();
19098 // ALULSLFast implies that shifts of up to 4 places are fast
19099 if (ShiftM1 <= 4 && ShiftN1 <= 4) {
19100 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
19101 return Add(Shl(MVal, ShiftN1), MVal);
19102 }
19103 }
19104 if (Subtarget->hasALULSLFast() &&
19105 isPowPlusPlusOneConst(ConstValue, CVM, CVN)) {
19106 unsigned ShiftM = CVM.getZExtValue();
19107 unsigned ShiftN = CVN.getZExtValue();
19108 // ALULSLFast implies that shifts of up to 4 places are fast
19109 if (ShiftM <= 4 && ShiftN <= 4) {
19110 SDValue MVal = Add(Shl(N0, CVM.getZExtValue()), N0);
19111 return Add(Shl(MVal, CVN.getZExtValue()), N0);
19112 }
19113 }
19114
19115 if (Subtarget->hasALULSLFast() &&
19116 isPowMinusMinusOneConst(ConstValue, CVM, CVN)) {
19117 unsigned ShiftM = CVM.getZExtValue();
19118 unsigned ShiftN = CVN.getZExtValue();
19119 // ALULSLFast implies that shifts of up to 4 places are fast
19120 if (ShiftM <= 4 && ShiftN <= 4) {
19121 SDValue MVal = Sub(N0, Shl(N0, CVM.getZExtValue()));
19122 return Sub(N0, Shl(MVal, CVN.getZExtValue()));
19123 }
19124 }
19125 } else {
19126 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
19127 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
19128 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
19129 APInt SCVPlus1 = -ShiftedConstValue + 1;
19130 APInt CVNegPlus1 = -ConstValue + 1;
19131 APInt CVNegMinus1 = -ConstValue - 1;
19132 if (CVNegPlus1.isPowerOf2()) {
19133 ShiftAmt = CVNegPlus1.logBase2();
19134 return Sub(N0, Shl(N0, ShiftAmt));
19135 } else if (CVNegMinus1.isPowerOf2()) {
19136 ShiftAmt = CVNegMinus1.logBase2();
19137 return Negate(Add(Shl(N0, ShiftAmt), N0));
19138 } else if (SCVPlus1.isPowerOf2()) {
19139 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
19140 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
19141 }
19142 }
19143
19144 return SDValue();
19145}
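// Worked example for the hasALULSLFast decompositions above: C = 45 equals
// (1 + 4) * (1 + 8), so isPowPlusPlusConst yields M = 5, N = 9 and the
// lowering is MVal = (x << 2) + x followed by (MVal << 3) + MVal, computing
// 45 * x with two shift+add operations instead of a constant move plus mul.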
19146
19147 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
19148 SelectionDAG &DAG) {
19149 // Take advantage of vector comparisons producing 0 or -1 in each lane to
19150 // optimize away operation when it's from a constant.
19151 //
19152 // The general transformation is:
19153 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
19154 // AND(VECTOR_CMP(x,y), constant2)
19155 // constant2 = UNARYOP(constant)
19156
19157 // Early exit if this isn't a vector operation, the operand of the
19158 // unary operation isn't a bitwise AND, or if the sizes of the operations
19159 // aren't the same.
19160 EVT VT = N->getValueType(0);
19161 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
19162 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
19163 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
19164 return SDValue();
19165
19166 // Now check that the other operand of the AND is a constant. We could
19167 // make the transformation for non-constant splats as well, but it's unclear
19168 // that would be a benefit as it would not eliminate any operations, just
19169 // perform one more step in scalar code before moving to the vector unit.
19170 if (BuildVectorSDNode *BV =
19171 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
19172 // Bail out if the vector isn't a constant.
19173 if (!BV->isConstant())
19174 return SDValue();
19175
19176 // Everything checks out. Build up the new and improved node.
19177 SDLoc DL(N);
19178 EVT IntVT = BV->getValueType(0);
19179 // Create a new constant of the appropriate type for the transformed
19180 // DAG.
19181 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
19182 // The AND node needs bitcasts to/from an integer vector type around it.
19183 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
19184 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
19185 N->getOperand(0)->getOperand(0), MaskConst);
19186 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
19187 return Res;
19188 }
19189
19190 return SDValue();
19191}
19192
19193/// Tries to replace scalar FP <-> INT conversions with SVE in streaming
19194/// functions, this can help to reduce the number of fmovs to/from GPRs.
19195static SDValue
19196 tryToReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG,
19197 TargetLowering::DAGCombinerInfo &DCI,
19198 const AArch64Subtarget *Subtarget) {
19199 if (N->isStrictFPOpcode())
19200 return SDValue();
19201
19202 if (DCI.isBeforeLegalizeOps())
19203 return SDValue();
19204
19205 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
19206 (!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()))
19207 return SDValue();
19208
19209 auto isSupportedType = [](EVT VT) {
19210 return !VT.isVector() && VT != MVT::bf16 && VT != MVT::f128;
19211 };
19212
19213 SDValue SrcVal = N->getOperand(0);
19214 EVT SrcTy = SrcVal.getValueType();
19215 EVT DestTy = N->getValueType(0);
19216
19217 if (!isSupportedType(SrcTy) || !isSupportedType(DestTy))
19218 return SDValue();
19219
19220 EVT SrcVecTy;
19221 EVT DestVecTy;
19222 if (DestTy.bitsGT(SrcTy)) {
19223 DestVecTy = getPackedSVEVectorVT(DestTy);
19224 SrcVecTy = DestVecTy.changeVectorElementType(SrcTy);
19225 } else {
19226 SrcVecTy = getPackedSVEVectorVT(SrcTy);
19227 DestVecTy = SrcVecTy.changeVectorElementType(DestTy);
19228 }
19229
19230 // Ensure the resulting src/dest vector type is legal.
19231 if (SrcVecTy == MVT::nxv2i32 || DestVecTy == MVT::nxv2i32)
19232 return SDValue();
19233
19234 SDLoc DL(N);
19235 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19236 SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SrcVecTy,
19237 DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
19238 SDValue Convert = DAG.getNode(N->getOpcode(), DL, DestVecTy, Vec);
19239 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestTy, Convert, ZeroIdx);
19240}
19241
19242 static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
19243 TargetLowering::DAGCombinerInfo &DCI,
19244 const AArch64Subtarget *Subtarget) {
19245 // First try to optimize away the conversion when it's conditionally from
19246 // a constant. Vectors only.
19247 if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
19248 return Res;
19249
19250 if (SDValue Res =
19251 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
19252 return Res;
19253
19254 EVT VT = N->getValueType(0);
19255 if (VT != MVT::f32 && VT != MVT::f64)
19256 return SDValue();
19257
19258 // Only optimize when the source and destination types have the same width.
19259 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
19260 return SDValue();
19261
19262 // If the result of an integer load is only used by an integer-to-float
19263 // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
19264 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
19265 SDValue N0 = N->getOperand(0);
19266 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
19267 N0.hasOneUse() &&
19268 // Do not change the width of a volatile load.
19269 !cast<LoadSDNode>(N0)->isVolatile()) {
19270 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
19271 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
19272 LN0->getPointerInfo(), LN0->getAlign(),
19273 LN0->getMemOperand()->getFlags());
19274
19275 // Make sure successors of the original load stay after it by updating them
19276 // to use the new Chain.
19277 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
19278
19279 unsigned Opcode =
19280 (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
19281 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
19282 }
19283
19284 return SDValue();
19285}
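// Illustration of the load case above: when sitofp(load i32 [addr]) is the
// only use of the load, the value is re-loaded directly into an FP register of
// the same width and converted with the scalar SITOF node, so it never travels
// through a GPR-to-FPR fmov.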
19286
19287/// Fold a floating-point multiply by power of two into floating-point to
19288/// fixed-point conversion.
19289 static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
19290 TargetLowering::DAGCombinerInfo &DCI,
19291 const AArch64Subtarget *Subtarget) {
19292 if (SDValue Res =
19293 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
19294 return Res;
19295
19296 if (!Subtarget->isNeonAvailable())
19297 return SDValue();
19298
19299 if (!N->getValueType(0).isSimple())
19300 return SDValue();
19301
19302 SDValue Op = N->getOperand(0);
19303 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
19304 return SDValue();
19305
19306 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
19307 return SDValue();
19308
19309 SDValue ConstVec = Op->getOperand(1);
19310 if (!isa<BuildVectorSDNode>(ConstVec))
19311 return SDValue();
19312
19313 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
19314 uint32_t FloatBits = FloatTy.getSizeInBits();
19315 if (FloatBits != 32 && FloatBits != 64 &&
19316 (FloatBits != 16 || !Subtarget->hasFullFP16()))
19317 return SDValue();
19318
19319 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
19320 uint32_t IntBits = IntTy.getSizeInBits();
19321 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
19322 return SDValue();
19323
19324 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
19325 if (IntBits > FloatBits)
19326 return SDValue();
19327
19328 BitVector UndefElements;
19329 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
19330 int32_t Bits = IntBits == 64 ? 64 : 32;
19331 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
19332 if (C == -1 || C == 0 || C > Bits)
19333 return SDValue();
19334
19335 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
19336 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
19337 return SDValue();
19338
19339 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
19340 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
19341 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
19342 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
19343 return SDValue();
19344 }
19345
19346 SDLoc DL(N);
19347 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
19348 N->getOpcode() == ISD::FP_TO_SINT_SAT);
19349 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
19350 : Intrinsic::aarch64_neon_vcvtfp2fxu;
19351 SDValue FixConv =
19352 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
19353 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
19354 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
19355 // We can handle smaller integers by generating an extra trunc.
19356 if (IntBits < FloatBits)
19357 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
19358
19359 return FixConv;
19360}
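// Example of the fold above: fptosi(fmul(x : v4f32, splat(16.0))) has a
// power-of-two splat with log2 equal to 4, so it becomes a single
// aarch64.neon.vcvtfp2fxs(x, 4) call, i.e. a float-to-fixed conversion with 4
// fractional bits instead of an fmul followed by a separate fcvtzs.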
19361
19362 static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19363 const AArch64TargetLowering &TLI) {
19364 EVT VT = N->getValueType(0);
19365 SelectionDAG &DAG = DCI.DAG;
19366 SDLoc DL(N);
19367 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
19368
19369 if (!VT.isVector())
19370 return SDValue();
19371
19372 if (VT.isScalableVector() && !Subtarget.hasSVE2())
19373 return SDValue();
19374
19375 if (VT.isFixedLengthVector() &&
19376 (!Subtarget.isNeonAvailable() || TLI.useSVEForFixedLengthVectorVT(VT)))
19377 return SDValue();
19378
19379 SDValue N0 = N->getOperand(0);
19380 if (N0.getOpcode() != ISD::AND)
19381 return SDValue();
19382
19383 SDValue N1 = N->getOperand(1);
19384 if (N1.getOpcode() != ISD::AND)
19385 return SDValue();
19386
19387 // InstCombine does (not (neg a)) => (add a -1).
19388 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
19389 // Loop over all combinations of AND operands.
19390 for (int i = 1; i >= 0; --i) {
19391 for (int j = 1; j >= 0; --j) {
19392 SDValue O0 = N0->getOperand(i);
19393 SDValue O1 = N1->getOperand(j);
19394 SDValue Sub, Add, SubSibling, AddSibling;
19395
19396 // Find a SUB and an ADD operand, one from each AND.
19397 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
19398 Sub = O0;
19399 Add = O1;
19400 SubSibling = N0->getOperand(1 - i);
19401 AddSibling = N1->getOperand(1 - j);
19402 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
19403 Add = O0;
19404 Sub = O1;
19405 AddSibling = N0->getOperand(1 - i);
19406 SubSibling = N1->getOperand(1 - j);
19407 } else
19408 continue;
19409
19410 if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
19411 continue;
19412
19413 // Constant ones is always the right-hand operand of the Add.
19414 if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
19415 continue;
19416
19417 if (Sub.getOperand(1) != Add.getOperand(0))
19418 continue;
19419
19420 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
19421 }
19422 }
19423
19424 // (or (and a b) (and (not a) c)) => (bsl a b c)
19425 // We only have to look for constant vectors here since the general, variable
19426 // case can be handled in TableGen.
19427 unsigned Bits = VT.getScalarSizeInBits();
19428 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
19429 for (int i = 1; i >= 0; --i)
19430 for (int j = 1; j >= 0; --j) {
19431 APInt Val1, Val2;
19432
19433 if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
19434 ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) &&
19435 (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) {
19436 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
19437 N0->getOperand(1 - i), N1->getOperand(1 - j));
19438 }
19439 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
19440 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
19441 if (!BVN0 || !BVN1)
19442 continue;
19443
19444 bool FoundMatch = true;
19445 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
19446 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
19447 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
19448 if (!CN0 || !CN1 ||
19449 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
19450 FoundMatch = false;
19451 break;
19452 }
19453 }
19454 if (FoundMatch)
19455 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
19456 N0->getOperand(1 - i), N1->getOperand(1 - j));
19457 }
19458
19459 return SDValue();
19460}
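// Constant-mask example for the path above: with v8i16 lanes,
// or(and(a, splat(0x00FF)), and(b, splat(0xFF00))) passes the per-lane
// complement check (0xFFFF & ~0x00FF == 0xFF00), so it becomes
// BSP(splat(0x00FF), a, b): bits come from a where the mask is set and from b
// elsewhere.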
19461
19462// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
19463// convert to csel(ccmp(.., cc0)), depending on cc1:
19464
19465// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
19466// =>
19467// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
19468//
19469// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
19470// =>
19471// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
19472 static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
19473 EVT VT = N->getValueType(0);
19474 SDValue CSel0 = N->getOperand(0);
19475 SDValue CSel1 = N->getOperand(1);
19476
19477 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
19478 CSel1.getOpcode() != AArch64ISD::CSEL)
19479 return SDValue();
19480
19481 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
19482 return SDValue();
19483
19484 if (!isNullConstant(CSel0.getOperand(0)) ||
19485 !isOneConstant(CSel0.getOperand(1)) ||
19486 !isNullConstant(CSel1.getOperand(0)) ||
19487 !isOneConstant(CSel1.getOperand(1)))
19488 return SDValue();
19489
19490 SDValue Cmp0 = CSel0.getOperand(3);
19491 SDValue Cmp1 = CSel1.getOperand(3);
19492 AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(2);
19493 AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(2);
19494 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
19495 return SDValue();
19496 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
19497 Cmp0.getOpcode() == AArch64ISD::SUBS) {
19498 std::swap(Cmp0, Cmp1);
19499 std::swap(CC0, CC1);
19500 }
19501
19502 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
19503 return SDValue();
19504
19505 SDLoc DL(N);
19506 SDValue CCmp, Condition;
19507 unsigned NZCV;
19508
19509 if (N->getOpcode() == ISD::AND) {
19510 AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(CC0);
19511 Condition = DAG.getConstant(InvCC0, DL, MVT_CC);
19512 NZCV = AArch64CC::getNZCVToSatisfyCondCode(CC1);
19513 } else {
19514 AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(CC1);
19515 Condition = DAG.getConstant(CC0, DL, MVT_CC);
19516 NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvCC1);
19517 }
19518
19519 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
19520
19521 auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
19522 if (Op1 && Op1->getAPIntValue().isNegative() &&
19523 Op1->getAPIntValue().sgt(-32)) {
19524 // CCMP accepts constants in the range [0, 31].
19525 // If Op1 is a constant in the range [-31, -1], we
19526 // can select CCMN instead to avoid the extra mov.
19527 SDValue AbsOp1 =
19528 DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
19529 CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1,
19530 NZCVOp, Condition, Cmp0);
19531 } else {
19532 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
19533 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
19534 }
19535 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
19536 CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
19537 CCmp);
19538}
19539
19540 static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19541 const AArch64Subtarget *Subtarget,
19542 const AArch64TargetLowering &TLI) {
19543 SelectionDAG &DAG = DCI.DAG;
19544 EVT VT = N->getValueType(0);
19545
19546 if (SDValue R = performANDORCSELCombine(N, DAG))
19547 return R;
19548
19549 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
19550 return SDValue();
19551
19552 if (SDValue Res = tryCombineToBSL(N, DCI, TLI))
19553 return Res;
19554
19555 return SDValue();
19556}
19557
19558 static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
19559 if (!MemVT.getVectorElementType().isSimple())
19560 return false;
19561
19562 uint64_t MaskForTy = 0ull;
19563 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
19564 case MVT::i8:
19565 MaskForTy = 0xffull;
19566 break;
19567 case MVT::i16:
19568 MaskForTy = 0xffffull;
19569 break;
19570 case MVT::i32:
19571 MaskForTy = 0xffffffffull;
19572 break;
19573 default:
19574 return false;
19575 break;
19576 }
19577
19578 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
19579 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
19580 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
19581
19582 return false;
19583}
19584
19585 static SDValue performReinterpretCastCombine(SDNode *N) {
19586 SDValue LeafOp = SDValue(N, 0);
19587 SDValue Op = N->getOperand(0);
19588 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
19589 LeafOp.getValueType() != Op.getValueType())
19590 Op = Op->getOperand(0);
19591 if (LeafOp.getValueType() == Op.getValueType())
19592 return Op;
19593 return SDValue();
19594}
19595
19596 static SDValue performSVEAndCombine(SDNode *N,
19597 TargetLowering::DAGCombinerInfo &DCI) {
19598 SelectionDAG &DAG = DCI.DAG;
19599 SDValue Src = N->getOperand(0);
19600 unsigned Opc = Src->getOpcode();
19601
19602 // Zero/any extend of an unsigned unpack
19603 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
19604 SDValue UnpkOp = Src->getOperand(0);
19605 SDValue Dup = N->getOperand(1);
19606
19607 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
19608 return SDValue();
19609
19610 SDLoc DL(N);
19611 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
19612 if (!C)
19613 return SDValue();
19614
19615 uint64_t ExtVal = C->getZExtValue();
19616
19617 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
19618 return ((ExtVal == 0xFF && VT == MVT::i8) ||
19619 (ExtVal == 0xFFFF && VT == MVT::i16) ||
19620 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
19621 };
19622
19623 // If the mask is fully covered by the unpack, we don't need to push
19624 // a new AND onto the operand
19625 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
19626 if (MaskAndTypeMatch(EltTy))
19627 return Src;
19628
19629 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
19630 // to see if the mask is all-ones of size MemTy.
19631 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(UnpkOp);
19632 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
19633 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
19634 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
19635 if (MaskAndTypeMatch(EltTy))
19636 return Src;
19637 }
19638
19639 // Truncate to prevent a DUP with an over-wide constant
19640 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
19641
19642 // Otherwise, make sure we propagate the AND to the operand
19643 // of the unpack
19644 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
19645 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
19646
19647 SDValue And = DAG.getNode(ISD::AND, DL,
19648 UnpkOp->getValueType(0), UnpkOp, Dup);
19649
19650 return DAG.getNode(Opc, DL, N->getValueType(0), And);
19651 }
19652
19653 if (DCI.isBeforeLegalizeOps())
19654 return SDValue();
19655
19656 // If both sides of AND operations are i1 splat_vectors then
19657 // we can produce just i1 splat_vector as the result.
19658 if (isAllActivePredicate(DAG, N->getOperand(0)))
19659 return N->getOperand(1);
19660 if (isAllActivePredicate(DAG, N->getOperand(1)))
19661 return N->getOperand(0);
19662
19663 if (!EnableCombineMGatherIntrinsics)
19664 return SDValue();
19665
19666 SDValue Mask = N->getOperand(1);
19667
19668 if (!Src.hasOneUse())
19669 return SDValue();
19670
19671 EVT MemVT;
19672
19673 // SVE load instructions perform an implicit zero-extend, which makes them
19674 // perfect candidates for combining.
19675 switch (Opc) {
19676 case AArch64ISD::LD1_MERGE_ZERO:
19677 case AArch64ISD::LDNF1_MERGE_ZERO:
19678 case AArch64ISD::LDFF1_MERGE_ZERO:
19679 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
19680 break;
19681 case AArch64ISD::GLD1_MERGE_ZERO:
19682 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
19683 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
19684 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
19685 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
19686 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
19687 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
19688 case AArch64ISD::GLDFF1_MERGE_ZERO:
19689 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
19690 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
19691 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
19692 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
19693 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
19694 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
19695 case AArch64ISD::GLDNT1_MERGE_ZERO:
19696 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
19697 break;
19698 default:
19699 return SDValue();
19700 }
19701
19702 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
19703 return Src;
19704
19705 return SDValue();
19706}
19707
19708// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
19709 static SDValue performANDSETCCCombine(SDNode *N,
19710 TargetLowering::DAGCombinerInfo &DCI) {
19711
19712 // This function performs an optimization on a specific pattern involving
19713 // an AND operation and SETCC (Set Condition Code) node.
19714
19715 SDValue SetCC = N->getOperand(0);
19716 EVT VT = N->getValueType(0);
19717 SelectionDAG &DAG = DCI.DAG;
19718
19719 // If the current node (N) is used by any SELECT instruction, return an
19720 // empty SDValue and skip the optimization to avoid producing
19721 // incorrect results.
19722 for (auto U : N->users())
19723 if (U->getOpcode() == ISD::SELECT)
19724 return SDValue();
19725
19726 // Check if the operand is a SETCC node with floating-point comparison
19727 if (SetCC.getOpcode() == ISD::SETCC &&
19728 SetCC.getOperand(0).getValueType() == MVT::f32) {
19729
19730 SDValue Cmp;
19731 AArch64CC::CondCode CC;
19732
19733 // Check if the DAG is after legalization and if we can emit the conjunction
19734 if (!DCI.isBeforeLegalize() &&
19735 (Cmp = emitConjunction(DAG, SDValue(N, 0), CC))) {
19736 AArch64CC::CondCode InvertedCC = AArch64CC::getInvertedCondCode(CC);
19737
19738
19739 SDLoc DL(N);
19740 return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
19741 DAG.getConstant(0, DL, VT),
19742 DAG.getConstant(InvertedCC, DL, MVT::i32), Cmp);
19743 }
19744 }
19745 return SDValue();
19746}
19747
19748 static SDValue performANDCombine(SDNode *N,
19749 TargetLowering::DAGCombinerInfo &DCI) {
19750 SelectionDAG &DAG = DCI.DAG;
19751 SDValue LHS = N->getOperand(0);
19752 SDValue RHS = N->getOperand(1);
19753 EVT VT = N->getValueType(0);
19754
19755 if (SDValue R = performANDORCSELCombine(N, DAG))
19756 return R;
19757
19758 if (SDValue R = performANDSETCCCombine(N,DCI))
19759 return R;
19760
19761 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
19762 return SDValue();
19763
19764 if (VT.isScalableVector())
19765 return performSVEAndCombine(N, DCI);
19766
19767 // The combining code below works only for NEON vectors. In particular, it
19768 // does not work for SVE when dealing with vectors wider than 128 bits.
19769 if (!VT.is64BitVector() && !VT.is128BitVector())
19770 return SDValue();
19771
19772 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
19773 if (!BVN)
19774 return SDValue();
19775
19776 // AND does not accept an immediate, so check if we can use a BIC immediate
19777 // instruction instead. We do this here instead of using a (and x, (mvni imm))
19778 // pattern in isel, because some immediates may be lowered to the preferred
19779 // (and x, (movi imm)) form, even though an mvni representation also exists.
19780 APInt DefBits(VT.getSizeInBits(), 0);
19781 APInt UndefBits(VT.getSizeInBits(), 0);
19782 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
19783 SDValue NewOp;
19784
19785 // Any bits known to already be 0 need not be cleared again, which can help
19786 // reduce the size of the immediate to one supported by the instruction.
19787 KnownBits Known = DAG.computeKnownBits(LHS);
19788 APInt ZeroSplat(VT.getSizeInBits(), 0);
19789 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
19790 ZeroSplat |= Known.Zero.zext(VT.getSizeInBits())
19791 << (Known.Zero.getBitWidth() * I);
19792
19793 DefBits = ~(DefBits | ZeroSplat);
19794 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
19795 DefBits, &LHS)) ||
19796 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
19797 DefBits, &LHS)))
19798 return NewOp;
19799
19800 UndefBits = ~(UndefBits | ZeroSplat);
19801 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
19802 UndefBits, &LHS)) ||
19803 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
19804 UndefBits, &LHS)))
19805 return NewOp;
19806 }
19807
19808 return SDValue();
19809}
19810
19811 static SDValue performFADDCombine(SDNode *N,
19812 TargetLowering::DAGCombinerInfo &DCI) {
19813 SelectionDAG &DAG = DCI.DAG;
19814 SDValue LHS = N->getOperand(0);
19815 SDValue RHS = N->getOperand(1);
19816 EVT VT = N->getValueType(0);
19817 SDLoc DL(N);
19818
19819 if (!N->getFlags().hasAllowReassociation())
19820 return SDValue();
19821
19822 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
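// The first data operand of the vcmla intrinsic is the accumulator, so the
// outer fadd can be folded into that operand without changing the rotation
// applied to the other two operands.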
19823 auto ReassocComplex = [&](SDValue A, SDValue B) {
19824 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
19825 return SDValue();
19826 unsigned Opc = A.getConstantOperandVal(0);
19827 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
19828 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
19829 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
19830 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
19831 return SDValue();
19832 SDValue VCMLA = DAG.getNode(
19833 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
19834 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
19835 A.getOperand(2), A.getOperand(3));
19836 VCMLA->setFlags(A->getFlags());
19837 return VCMLA;
19838 };
19839 if (SDValue R = ReassocComplex(LHS, RHS))
19840 return R;
19841 if (SDValue R = ReassocComplex(RHS, LHS))
19842 return R;
19843
19844 return SDValue();
19845}
19846
19847static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
19848 switch (Opcode) {
19849 case ISD::STRICT_FADD:
19850 case ISD::FADD:
19851 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
19852 case ISD::ADD:
19853 return VT == MVT::i64;
19854 default:
19855 return false;
19856 }
19857}
19858
19859static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
19860 AArch64CC::CondCode Cond);
19861
19862 static bool isPredicateCCSettingOp(SDValue N) {
19863 if ((N.getOpcode() == ISD::SETCC) ||
19864 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
19865 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
19866 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
19867 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
19868 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
19869 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
19870 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
19871 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
19872 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
19873 // get_active_lane_mask is lowered to a whilelo instruction.
19874 N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
19875 return true;
19876
19877 return false;
19878}
19879
19880// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
19881// ... into: "ptrue p, all" + PTEST
19882static SDValue
19883 performFirstTrueTestVectorCombine(SDNode *N,
19884 TargetLowering::DAGCombinerInfo &DCI,
19885 const AArch64Subtarget *Subtarget) {
19886 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
19887 // Make sure PTEST can be legalised with illegal types.
19888 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
19889 return SDValue();
19890
19891 SDValue N0 = N->getOperand(0);
19892 EVT VT = N0.getValueType();
19893
19894 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
19895 !isNullConstant(N->getOperand(1)))
19896 return SDValue();
19897
19898 // Restrict the DAG combine to cases where we're extracting from a
19899 // flag-setting operation.
19900 if (!isPredicateCCSettingOp(N0))
19901 return SDValue();
19902
19903 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
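// PTEST sets the condition flags from the tested predicate; the FIRST_ACTIVE
// condition is true when the lowest-numbered tested element is active, so the
// i1 extract of lane 0 typically becomes a PTEST followed by a CSET.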
19904 SelectionDAG &DAG = DCI.DAG;
19905 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
19906 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
19907}
19908
19909// Materialize : Idx = (add (mul vscale, NumEls), -1)
19910// i1 = extract_vector_elt t37, Constant:i64<Idx>
19911// ... into: "ptrue p, all" + PTEST
19912static SDValue
19913 performLastTrueTestVectorCombine(SDNode *N,
19914 TargetLowering::DAGCombinerInfo &DCI,
19915 const AArch64Subtarget *Subtarget) {
19916 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
19917 // Make sure PTEST can be legalised with illegal types.
19918 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
19919 return SDValue();
19920
19921 SDValue N0 = N->getOperand(0);
19922 EVT OpVT = N0.getValueType();
19923
19924 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
19925 return SDValue();
19926
19927 // Idx == (add (mul vscale, NumEls), -1)
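// For example, for an nxv4i1 operand the last lane has index (vscale * 4) - 1,
// which the DAG expresses as (add (vscale 4), -1).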
19928 SDValue Idx = N->getOperand(1);
19929 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
19930 return SDValue();
19931
19932 SDValue VS = Idx.getOperand(0);
19933 if (VS.getOpcode() != ISD::VSCALE)
19934 return SDValue();
19935
19936 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
19937 if (VS.getConstantOperandVal(0) != NumEls)
19938 return SDValue();
19939
19940 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
19941 SelectionDAG &DAG = DCI.DAG;
19942 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
19943 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
19944}
19945
19946static SDValue
19947 performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19948 const AArch64Subtarget *Subtarget) {
19949 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
19950 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
19951 return Res;
19952 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
19953 return Res;
19954
19955 SelectionDAG &DAG = DCI.DAG;
19956 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
19957
19958 EVT VT = N->getValueType(0);
19959 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
19960 bool IsStrict = N0->isStrictFPOpcode();
19961
19962 // extract(dup x) -> x
19963 if (N0.getOpcode() == AArch64ISD::DUP)
19964 return VT.isInteger() ? DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT)
19965 : N0.getOperand(0);
19966
19967 // Rewrite for pairwise fadd pattern
19968 // (f32 (extract_vector_elt
19969 // (fadd (vXf32 Other)
19970 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
19971 // ->
19972 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
19973 // (extract_vector_elt (vXf32 Other) 1))
19974 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
19975 // we can only do this when it's used only by the extract_vector_elt.
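// On AArch64 the resulting scalar fadd of lanes 0 and 1 matches the pairwise
// add instructions, e.g. faddp d0, v0.2d for the f64 case.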
19976 if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
19977 (!IsStrict || N0.hasOneUse())) {
19978 SDLoc DL(N0);
19979 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
19980 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
19981
19982 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
19983 SDValue Other = N00;
19984
19985 // And handle the commutative case.
19986 if (!Shuffle) {
19987 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
19988 Other = N01;
19989 }
19990
19991 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
19992 Other == Shuffle->getOperand(0)) {
19993 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
19994 DAG.getConstant(0, DL, MVT::i64));
19995 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
19996 DAG.getConstant(1, DL, MVT::i64));
19997 if (!IsStrict)
19998 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
19999
20000 // For strict_fadd we need uses of the final extract_vector to be replaced
20001 // with the strict_fadd, but we also need uses of the chain output of the
20002 // original strict_fadd to use the chain output of the new strict_fadd as
20003 // otherwise it may not be deleted.
20004 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
20005 {VT, MVT::Other},
20006 {N0->getOperand(0), Extract1, Extract2});
20007 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
20008 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
20009 return SDValue(N, 0);
20010 }
20011 }
20012
20013 return SDValue();
20014}
20015
20016 static SDValue performConcatVectorsCombine(SDNode *N,
20017 TargetLowering::DAGCombinerInfo &DCI,
20018 SelectionDAG &DAG) {
20019 SDLoc dl(N);
20020 EVT VT = N->getValueType(0);
20021 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
20022 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
20023
20024 if (VT.isScalableVector())
20025 return SDValue();
20026
20027 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
20028 N1Opc == ISD::TRUNCATE) {
20029 SDValue N00 = N0->getOperand(0);
20030 SDValue N10 = N1->getOperand(0);
20031 EVT N00VT = N00.getValueType();
20032 unsigned N00Opc = N00.getOpcode(), N10Opc = N10.getOpcode();
20033
20034 // Optimize concat_vectors of truncated vectors, where the intermediate
20035 // type is illegal, to avoid said illegality, e.g.,
20036 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
20037 // (v2i16 (truncate (v2i64)))))
20038 // ->
20039 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
20040 // (v4i32 (bitcast (v2i64))),
20041 // <0, 2, 4, 6>)))
20042 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
20043 // on both input and result type, so we might generate worse code.
20044 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
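// The <0, 2, 4, ...> even-element shuffle of the two bitcast inputs is the
// UZP1 pattern, which together with the final truncate can typically be
// selected as a single uzp1 on the narrower element type.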
20045 if (N00VT == N10.getValueType() &&
20046 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
20047 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
20048 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
20049 SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
20050 for (size_t i = 0; i < Mask.size(); ++i)
20051 Mask[i] = i * 2;
20052 return DAG.getNode(ISD::TRUNCATE, dl, VT,
20053 DAG.getVectorShuffle(
20054 MidVT, dl,
20055 DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
20056 DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
20057 }
20058
20059 // Optimize two large shifts and a combine into a single combine and shift
20060 // For AArch64 architectures, sequences like the following:
20061 //
20062 // ushr v0.4s, v0.4s, #20
20063 // ushr v1.4s, v1.4s, #20
20064 // uzp1 v0.8h, v0.8h, v1.8h
20065 //
20066 // Can be optimized to:
20067 //
20068 // uzp2 v0.8h, v0.8h, v1.8h
20069 // ushr v0.8h, v0.8h, #4
20070 //
20071 // This optimization reduces instruction count.
20072 if (N00Opc == AArch64ISD::VLSHR && N10Opc == AArch64ISD::VLSHR &&
20073 N00->getOperand(1) == N10->getOperand(1)) {
20074 SDValue N000 = N00->getOperand(0);
20075 SDValue N100 = N10->getOperand(0);
20076 uint64_t N001ConstVal = N00->getConstantOperandVal(1),
20077 N101ConstVal = N10->getConstantOperandVal(1),
20078 NScalarSize = N->getValueType(0).getScalarSizeInBits();
20079
20080 if (N001ConstVal == N101ConstVal && N001ConstVal > NScalarSize) {
20081 N000 = DAG.getNode(AArch64ISD::NVCAST, dl, VT, N000);
20082 N100 = DAG.getNode(AArch64ISD::NVCAST, dl, VT, N100);
20083 SDValue Uzp = DAG.getNode(AArch64ISD::UZP2, dl, VT, N000, N100);
20084 SDValue NewShiftConstant =
20085 DAG.getConstant(N001ConstVal - NScalarSize, dl, MVT::i32);
20086
20087 return DAG.getNode(AArch64ISD::VLSHR, dl, VT, Uzp, NewShiftConstant);
20088 }
20089 }
20090 }
20091
20092 if (N->getOperand(0).getValueType() == MVT::v4i8 ||
20093 N->getOperand(0).getValueType() == MVT::v2i16 ||
20094 N->getOperand(0).getValueType() == MVT::v2i8) {
20095 EVT SrcVT = N->getOperand(0).getValueType();
20096 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
20097 // loads to prevent having to go through the v4i8 load legalization that
20098 // needs to extend each element into a larger type.
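// For example, a concat of two simple v4i8 loads becomes a v2f32 build_vector
// of two f32 loads, which is then bitcast back to the original v8i8 result.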
20099 if (N->getNumOperands() % 2 == 0 &&
20100 all_of(N->op_values(), [SrcVT](SDValue V) {
20101 if (V.getValueType() != SrcVT)
20102 return false;
20103 if (V.isUndef())
20104 return true;
20105 LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
20106 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
20107 LD->getExtensionType() == ISD::NON_EXTLOAD;
20108 })) {
20109 EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
20110 EVT NVT = EVT::getVectorVT(*DAG.getContext(), FVT, N->getNumOperands());
20111 SmallVector<SDValue> Ops;
20112
20113 for (unsigned i = 0; i < N->getNumOperands(); i++) {
20114 SDValue V = N->getOperand(i);
20115 if (V.isUndef())
20116 Ops.push_back(DAG.getUNDEF(FVT));
20117 else {
20118 LoadSDNode *LD = cast<LoadSDNode>(V);
20119 SDValue NewLoad = DAG.getLoad(FVT, dl, LD->getChain(),
20120 LD->getBasePtr(), LD->getMemOperand());
20121 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
20122 Ops.push_back(NewLoad);
20123 }
20124 }
20125 return DAG.getBitcast(N->getValueType(0),
20126 DAG.getBuildVector(NVT, dl, Ops));
20127 }
20128 }
20129
20130 // Canonicalise concat_vectors to replace concatenations of truncated nots
20131 // with nots of concatenated truncates. This in some cases allows for multiple
20132 // redundant negations to be eliminated.
20133 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
20134 // (v4i16 (truncate (not (v4i32)))))
20135 // ->
20136 // (not (concat_vectors (v4i16 (truncate (v4i32))),
20137 // (v4i16 (truncate (v4i32)))))
20138 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
20139 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
20140 N->isOnlyUserOf(N1.getNode())) {
20141 auto isBitwiseVectorNegate = [](SDValue V) {
20142 return V->getOpcode() == ISD::XOR &&
20143 ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
20144 };
20145 SDValue N00 = N0->getOperand(0);
20146 SDValue N10 = N1->getOperand(0);
20147 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
20148 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
20149 return DAG.getNOT(
20150 dl,
20151 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
20152 DAG.getNode(ISD::TRUNCATE, dl, N0.getValueType(),
20153 N00->getOperand(0)),
20154 DAG.getNode(ISD::TRUNCATE, dl, N1.getValueType(),
20155 N10->getOperand(0))),
20156 VT);
20157 }
20158 }
20159
20160 // Wait till after everything is legalized to try this. That way we have
20161 // legal vector types and such.
20162 if (DCI.isBeforeLegalizeOps())
20163 return SDValue();
20164
20165 // Optimise concat_vectors of two identical binops with a 128-bit destination
20166 // size, combining into a binop of two concats of the source vectors, e.g.:
20167 // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
20168 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
20169 DAG.getTargetLoweringInfo().isBinOp(N0Opc) && N0->hasOneUse() &&
20170 N1->hasOneUse()) {
20171 SDValue N00 = N0->getOperand(0);
20172 SDValue N01 = N0->getOperand(1);
20173 SDValue N10 = N1->getOperand(0);
20174 SDValue N11 = N1->getOperand(1);
20175
20176 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
20177 SDValue Concat0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N00, N10);
20178 SDValue Concat1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N01, N11);
20179 return DAG.getNode(N0Opc, dl, VT, Concat0, Concat1);
20180 }
20181 }
20182
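// IsRSHRN below matches a rounding shift right: a VLSHR by Shift whose input
// is an ADD of the rounding constant 1 << (Shift - 1), i.e. the pattern that
// the RSHRN (rounding shift right narrow) instructions implement once the
// result is narrowed.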
20183 auto IsRSHRN = [](SDValue Shr) {
20184 if (Shr.getOpcode() != AArch64ISD::VLSHR)
20185 return false;
20186 SDValue Op = Shr.getOperand(0);
20187 EVT VT = Op.getValueType();
20188 unsigned ShtAmt = Shr.getConstantOperandVal(1);
20189 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
20190 return false;
20191
20192 APInt Imm;
20193 if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift)
20194 Imm = APInt(VT.getScalarSizeInBits(),
20195 Op.getOperand(1).getConstantOperandVal(0)
20196 << Op.getOperand(1).getConstantOperandVal(1));
20197 else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP &&
20198 isa<ConstantSDNode>(Op.getOperand(1).getOperand(0)))
20199 Imm = APInt(VT.getScalarSizeInBits(),
20200 Op.getOperand(1).getConstantOperandVal(0));
20201 else
20202 return false;
20203
20204 if (Imm != 1ULL << (ShtAmt - 1))
20205 return false;
20206 return true;
20207 };
20208
20209 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
20210 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
20211 ((IsRSHRN(N1) &&
20212 N0.getConstantOperandVal(1) == N1.getConstantOperandVal(1)) ||
20213 N1.isUndef())) {
20214 SDValue X = N0.getOperand(0).getOperand(0);
20215 SDValue Y = N1.isUndef() ? DAG.getUNDEF(X.getValueType())
20216 : N1.getOperand(0).getOperand(0);
20217 EVT BVT =
20218 X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
20219 SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, dl, BVT, X, Y);
20220 SDValue Add = DAG.getNode(
20221 ISD::ADD, dl, BVT, CC,
20222 DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), dl, BVT));
20223 SDValue Shr =
20224 DAG.getNode(AArch64ISD::VLSHR, dl, BVT, Add, N0.getOperand(1));
20225 return Shr;
20226 }
20227
20228 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
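// zip1 interleaves the low halves of a and b and zip2 interleaves the high
// halves, so their concatenation is the full interleave, i.e. a single zip1 of
// the inputs widened into the concatenated type.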
20229 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
20230 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
20231 N0.getOperand(1) == N1.getOperand(1)) {
20232 SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(0),
20233 DAG.getUNDEF(N0.getValueType()));
20234 SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(1),
20235 DAG.getUNDEF(N0.getValueType()));
20236 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, E0, E1);
20237 }
20238
20239 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
20240 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
20241 // canonicalise to that.
20242 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
20243 assert(VT.getScalarSizeInBits() == 64);
20244 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
20245 DAG.getConstant(0, dl, MVT::i64));
20246 }
20247
20248 // Canonicalise concat_vectors so that the right-hand vector has as few
20249 // bit-casts as possible before its real operation. The primary matching
20250 // destination for these operations will be the narrowing "2" instructions,
20251 // which depend on the operation being performed on this right-hand vector.
20252 // For example,
20253 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
20254 // becomes
20255 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
20256
20257 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
20258 return SDValue();
20259 SDValue RHS = N1->getOperand(0);
20260 MVT RHSTy = RHS.getValueType().getSimpleVT();
20261 // If the RHS is not a vector, this is not the pattern we're looking for.
20262 if (!RHSTy.isVector())
20263 return SDValue();
20264
20265 LLVM_DEBUG(
20266 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
20267
20268 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
20269 RHSTy.getVectorNumElements() * 2);
20270 return DAG.getNode(ISD::BITCAST, dl, VT,
20271 DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
20272 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
20273 RHS));
20274}
20275
20276static SDValue
20277 performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20278 SelectionDAG &DAG) {
20279 if (DCI.isBeforeLegalizeOps())
20280 return SDValue();
20281
20282 EVT VT = N->getValueType(0);
20283 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
20284 return SDValue();
20285
20286 SDValue V = N->getOperand(0);
20287
20288 // NOTE: This combine exists in DAGCombiner, but that version's legality check
20289 // blocks this combine because the non-const case requires custom lowering.
20290 //
20291 // ty1 extract_subvector(ty2 splat(const)) -> ty1 splat(const)
20292 if (V.getOpcode() == ISD::SPLAT_VECTOR)
20293 if (isa<ConstantSDNode>(V.getOperand(0)))
20294 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
20295
20296 return SDValue();
20297}
20298
20299static SDValue
20300 performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20301 SelectionDAG &DAG) {
20302 SDLoc DL(N);
20303 SDValue Vec = N->getOperand(0);
20304 SDValue SubVec = N->getOperand(1);
20305 uint64_t IdxVal = N->getConstantOperandVal(2);
20306 EVT VecVT = Vec.getValueType();
20307 EVT SubVT = SubVec.getValueType();
20308
20309 // Only do this for legal fixed vector types.
20310 if (!VecVT.isFixedLengthVector() ||
20311 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
20312 !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
20313 return SDValue();
20314
20315 // Ignore widening patterns.
20316 if (IdxVal == 0 && Vec.isUndef())
20317 return SDValue();
20318
20319 // Subvector must be half the width and an "aligned" insertion.
20320 unsigned NumSubElts = SubVT.getVectorNumElements();
20321 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
20322 (IdxVal != 0 && IdxVal != NumSubElts))
20323 return SDValue();
20324
20325 // Fold insert_subvector -> concat_vectors
20326 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
20327 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
20328 SDValue Lo, Hi;
20329 if (IdxVal == 0) {
20330 Lo = SubVec;
20331 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
20332 DAG.getVectorIdxConstant(NumSubElts, DL));
20333 } else {
20334 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
20335 DAG.getVectorIdxConstant(0, DL));
20336 Hi = SubVec;
20337 }
20338 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
20339}
20340
20341 static SDValue tryCombineFixedPointConvert(SDNode *N,
20342 TargetLowering::DAGCombinerInfo &DCI,
20343 SelectionDAG &DAG) {
20344 // Wait until after everything is legalized to try this. That way we have
20345 // legal vector types and such.
20346 if (DCI.isBeforeLegalizeOps())
20347 return SDValue();
20348 // Transform a scalar conversion of a value from a lane extract into a
20349 // lane extract of a vector conversion. E.g., from foo1 to foo2:
20350 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
20351 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
20352 //
20353 // The second form interacts better with instruction selection and the
20354 // register allocator to avoid cross-class register copies that aren't
20355 // coalescable due to a lane reference.
20356
20357 // Check the operand and see if it originates from a lane extract.
20358 SDValue Op1 = N->getOperand(1);
20359 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
20360 return SDValue();
20361
20362 // Yep, no additional predication needed. Perform the transform.
20363 SDValue IID = N->getOperand(0);
20364 SDValue Shift = N->getOperand(2);
20365 SDValue Vec = Op1.getOperand(0);
20366 SDValue Lane = Op1.getOperand(1);
20367 EVT ResTy = N->getValueType(0);
20368 EVT VecResTy;
20369 SDLoc DL(N);
20370
20371 // The vector width should be 128 bits by the time we get here, even
20372 // if it started as 64 bits (the extract_vector handling will have
20373 // done so). Bail if it is not.
20374 if (Vec.getValueSizeInBits() != 128)
20375 return SDValue();
20376
20377 if (Vec.getValueType() == MVT::v4i32)
20378 VecResTy = MVT::v4f32;
20379 else if (Vec.getValueType() == MVT::v2i64)
20380 VecResTy = MVT::v2f64;
20381 else
20382 return SDValue();
20383
20384 SDValue Convert =
20385 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
20386 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
20387}
20388
20389// AArch64 high-vector "long" operations are formed by performing the non-high
20390// version on an extract_subvector of each operand which gets the high half:
20391//
20392// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
20393//
20394// However, there are cases which don't have an extract_high explicitly, but
20395// have another operation that can be made compatible with one for free. For
20396// example:
20397//
20398// (dupv64 scalar) --> (extract_high (dup128 scalar))
20399//
20400// This routine does the actual conversion of such DUPs, once outer routines
20401// have determined that everything else is in order.
20402// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
20403// similarly here.
20404 static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
20405 MVT VT = N.getSimpleValueType();
20406 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
20407 N.getConstantOperandVal(1) == 0)
20408 N = N.getOperand(0);
20409
20410 switch (N.getOpcode()) {
20411 case AArch64ISD::DUP:
20412 case AArch64ISD::DUPLANE8:
20413 case AArch64ISD::DUPLANE16:
20414 case AArch64ISD::DUPLANE32:
20415 case AArch64ISD::DUPLANE64:
20416 case AArch64ISD::MOVI:
20417 case AArch64ISD::MOVIshift:
20418 case AArch64ISD::MOVIedit:
20419 case AArch64ISD::MOVImsl:
20420 case AArch64ISD::MVNIshift:
20421 case AArch64ISD::MVNImsl:
20422 break;
20423 default:
20424 // FMOV could be supported, but isn't very useful, as it would only occur
20425 // if you passed a bitcast floating-point immediate to an eligible long
20426 // integer op (addl, smull, ...).
20427 return SDValue();
20428 }
20429
20430 if (!VT.is64BitVector())
20431 return SDValue();
20432
20433 SDLoc DL(N);
20434 unsigned NumElems = VT.getVectorNumElements();
20435 if (N.getValueType().is64BitVector()) {
20436 MVT ElementTy = VT.getVectorElementType();
20437 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
20438 N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
20439 }
20440
20441 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
20442 DAG.getConstant(NumElems, DL, MVT::i64));
20443}
20444
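// Returns true if N (looking through a bitcast) is an extract_subvector of the
// high half of a fixed-width vector, i.e. the form the "extract_high" operands
// of the long instructions described above take.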
20445 static bool isEssentiallyExtractHighSubvector(SDValue N) {
20446 if (N.getOpcode() == ISD::BITCAST)
20447 N = N.getOperand(0);
20448 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
20449 return false;
20450 if (N.getOperand(0).getValueType().isScalableVector())
20451 return false;
20452 return N.getConstantOperandAPInt(1) ==
20453 N.getOperand(0).getValueType().getVectorNumElements() / 2;
20454}
20455
20456/// Helper structure to keep track of ISD::SET_CC operands.
20461};
20462
20463/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
20464 struct AArch64SetCCInfo {
20465 const SDValue *Cmp;
20466 AArch64CC::CondCode CC;
20467 };
20468
20469/// Helper structure to keep track of SetCC information.
20473};
20474
20475/// Helper structure to be able to read SetCC information. If set to
20476/// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a
20477/// GenericSetCCInfo.
20481};
20482
20483 /// Check whether or not \p Op is a SET_CC operation, either a generic or
20484 /// an AArch64 lowered one.
20485 /// \p SetCCInfo is filled accordingly.
20486 /// \post SetCCInfo is meaningful only when this function returns true.
20488/// \return True when Op is a kind of SET_CC operation.
20489 static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
20490 // If this is a setcc, this is straightforward.
20491 if (Op.getOpcode() == ISD::SETCC) {
20492 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
20493 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
20494 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
20495 SetCCInfo.IsAArch64 = false;
20496 return true;
20497 }
20498 // Otherwise, check if this is a matching csel instruction.
20499 // In other words:
20500 // - csel 1, 0, cc
20501 // - csel 0, 1, !cc
20502 if (Op.getOpcode() != AArch64ISD::CSEL)
20503 return false;
20504 // Set the information about the operands.
20505 // TODO: we want the operands of the Cmp not the csel
20506 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
20507 SetCCInfo.IsAArch64 = true;
20508 SetCCInfo.Info.AArch64.CC =
20509 static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
20510
20511 // Check that the operands match the constraints:
20512 // (1) Both operands must be constants.
20513 // (2) One must be 1 and the other must be 0.
20514 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
20515 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
20516
20517 // Check (1).
20518 if (!TValue || !FValue)
20519 return false;
20520
20521 // Check (2).
20522 if (!TValue->isOne()) {
20523 // Update the comparison when we are interested in !cc.
20524 std::swap(TValue, FValue);
20525 SetCCInfo.Info.AArch64.CC =
20526 AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
20527 }
20528 return TValue->isOne() && FValue->isZero();
20529}
20530
20531// Returns true if Op is setcc or zext of setcc.
20532static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
20533 if (