1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
22#include "llvm/ADT/APFloat.h"
23#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/ArrayRef.h"
25#include "llvm/ADT/STLExtras.h"
26#include "llvm/ADT/SmallSet.h"
28#include "llvm/ADT/Statistic.h"
29#include "llvm/ADT/StringRef.h"
30#include "llvm/ADT/Twine.h"
59#include "llvm/IR/Attributes.h"
60#include "llvm/IR/Constants.h"
61#include "llvm/IR/DataLayout.h"
62#include "llvm/IR/DebugLoc.h"
64#include "llvm/IR/Function.h"
66#include "llvm/IR/GlobalValue.h"
67#include "llvm/IR/IRBuilder.h"
68#include "llvm/IR/Instruction.h"
71#include "llvm/IR/Intrinsics.h"
72#include "llvm/IR/IntrinsicsAArch64.h"
73#include "llvm/IR/Module.h"
75#include "llvm/IR/Type.h"
76#include "llvm/IR/Use.h"
77#include "llvm/IR/Value.h"
83#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <bitset>
94#include <cassert>
95#include <cctype>
96#include <cstdint>
97#include <cstdlib>
98#include <iterator>
99#include <limits>
100#include <optional>
101#include <tuple>
102#include <utility>
103#include <vector>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107
108#define DEBUG_TYPE "aarch64-lower"
109
110STATISTIC(NumTailCalls, "Number of tail calls");
111STATISTIC(NumShiftInserts, "Number of vector shift inserts");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130// in the future once both implementations are based on MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
142// All of the XOR, OR and CMP nodes use ALU ports, and the data dependency will
143// become the bottleneck after this transform on a high-end CPU. So this max
144// leaf-node limit guards that the cmp+ccmp transform stays profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148/// Value type used for condition codes.
149static const MVT MVT_CC = MVT::i32;
150
151static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
152 AArch64::X3, AArch64::X4, AArch64::X5,
153 AArch64::X6, AArch64::X7};
154static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
155 AArch64::Q3, AArch64::Q4, AArch64::Q5,
156 AArch64::Q6, AArch64::Q7};
157
159
161
162static inline EVT getPackedSVEVectorVT(EVT VT) {
163 switch (VT.getSimpleVT().SimpleTy) {
164 default:
165 llvm_unreachable("unexpected element type for vector");
166 case MVT::i8:
167 return MVT::nxv16i8;
168 case MVT::i16:
169 return MVT::nxv8i16;
170 case MVT::i32:
171 return MVT::nxv4i32;
172 case MVT::i64:
173 return MVT::nxv2i64;
174 case MVT::f16:
175 return MVT::nxv8f16;
176 case MVT::f32:
177 return MVT::nxv4f32;
178 case MVT::f64:
179 return MVT::nxv2f64;
180 case MVT::bf16:
181 return MVT::nxv8bf16;
182 }
183}
184
185// NOTE: Currently there's only a need to return integer vector types. If this
186// changes then just add an extra "type" parameter.
188 switch (EC.getKnownMinValue()) {
189 default:
190 llvm_unreachable("unexpected element count for vector");
191 case 16:
192 return MVT::nxv16i8;
193 case 8:
194 return MVT::nxv8i16;
195 case 4:
196 return MVT::nxv4i32;
197 case 2:
198 return MVT::nxv2i64;
199 }
200}
201
203 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
204 "Expected scalable predicate vector type!");
205 switch (VT.getVectorMinNumElements()) {
206 default:
207 llvm_unreachable("unexpected element count for vector");
208 case 2:
209 return MVT::nxv2i64;
210 case 4:
211 return MVT::nxv4i32;
212 case 8:
213 return MVT::nxv8i16;
214 case 16:
215 return MVT::nxv16i8;
216 }
217}
218
219/// Returns true if VT's elements occupy the lowest bit positions of its
220/// associated register class without any intervening space.
221///
222/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
223/// same register class, but only nxv8f16 can be treated as a packed vector.
224static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
226 "Expected legal vector type!");
227 return VT.isFixedLengthVector() ||
229}
230
231// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
232// predicate and end with a passthru value matching the result type.
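// (Illustrative note, not from the original source: a unary node of this form
// is laid out as OPC_MERGE_PASSTHRU(Pred, Src, Passthru), where the trailing
// Passthru supplies the value of the lanes that Pred leaves inactive.)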
233static bool isMergePassthruOpcode(unsigned Opc) {
234 switch (Opc) {
235 default:
236 return false;
266 return true;
267 }
268}
269
270// Returns true if inactive lanes are known to be zeroed by construction.
272 switch (Op.getOpcode()) {
273 default:
274 return false;
275 // We guarantee i1 splat_vectors to zero the other lanes
279 return true;
281 switch (Op.getConstantOperandVal(0)) {
282 default:
283 return false;
284 case Intrinsic::aarch64_sve_ptrue:
285 case Intrinsic::aarch64_sve_pnext:
286 case Intrinsic::aarch64_sve_cmpeq:
287 case Intrinsic::aarch64_sve_cmpne:
288 case Intrinsic::aarch64_sve_cmpge:
289 case Intrinsic::aarch64_sve_cmpgt:
290 case Intrinsic::aarch64_sve_cmphs:
291 case Intrinsic::aarch64_sve_cmphi:
292 case Intrinsic::aarch64_sve_cmpeq_wide:
293 case Intrinsic::aarch64_sve_cmpne_wide:
294 case Intrinsic::aarch64_sve_cmpge_wide:
295 case Intrinsic::aarch64_sve_cmpgt_wide:
296 case Intrinsic::aarch64_sve_cmplt_wide:
297 case Intrinsic::aarch64_sve_cmple_wide:
298 case Intrinsic::aarch64_sve_cmphs_wide:
299 case Intrinsic::aarch64_sve_cmphi_wide:
300 case Intrinsic::aarch64_sve_cmplo_wide:
301 case Intrinsic::aarch64_sve_cmpls_wide:
302 case Intrinsic::aarch64_sve_fcmpeq:
303 case Intrinsic::aarch64_sve_fcmpne:
304 case Intrinsic::aarch64_sve_fcmpge:
305 case Intrinsic::aarch64_sve_fcmpgt:
306 case Intrinsic::aarch64_sve_fcmpuo:
307 case Intrinsic::aarch64_sve_facgt:
308 case Intrinsic::aarch64_sve_facge:
309 case Intrinsic::aarch64_sve_whilege:
310 case Intrinsic::aarch64_sve_whilegt:
311 case Intrinsic::aarch64_sve_whilehi:
312 case Intrinsic::aarch64_sve_whilehs:
313 case Intrinsic::aarch64_sve_whilele:
314 case Intrinsic::aarch64_sve_whilelo:
315 case Intrinsic::aarch64_sve_whilels:
316 case Intrinsic::aarch64_sve_whilelt:
317 case Intrinsic::aarch64_sve_match:
318 case Intrinsic::aarch64_sve_nmatch:
319 case Intrinsic::aarch64_sve_whilege_x2:
320 case Intrinsic::aarch64_sve_whilegt_x2:
321 case Intrinsic::aarch64_sve_whilehi_x2:
322 case Intrinsic::aarch64_sve_whilehs_x2:
323 case Intrinsic::aarch64_sve_whilele_x2:
324 case Intrinsic::aarch64_sve_whilelo_x2:
325 case Intrinsic::aarch64_sve_whilels_x2:
326 case Intrinsic::aarch64_sve_whilelt_x2:
327 return true;
328 }
329 }
330}
331
333 const AArch64Subtarget &STI)
334 : TargetLowering(TM), Subtarget(&STI) {
335 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
336 // we have to make something up. Arbitrarily, choose ZeroOrOne.
338 // When comparing vectors, each element of the result is set to all-ones or
339 // all-zeros.
341
342 // Set up the register classes.
343 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
344 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
345
346 if (Subtarget->hasLS64()) {
347 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
348 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
350 }
351
352 if (Subtarget->hasFPARMv8()) {
353 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
354 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
355 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
356 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
357 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
358 }
359
360 if (Subtarget->hasNEON()) {
361 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
362 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
363 // Someone set us up the NEON.
364 addDRTypeForNEON(MVT::v2f32);
365 addDRTypeForNEON(MVT::v8i8);
366 addDRTypeForNEON(MVT::v4i16);
367 addDRTypeForNEON(MVT::v2i32);
368 addDRTypeForNEON(MVT::v1i64);
369 addDRTypeForNEON(MVT::v1f64);
370 addDRTypeForNEON(MVT::v4f16);
371 addDRTypeForNEON(MVT::v4bf16);
372
373 addQRTypeForNEON(MVT::v4f32);
374 addQRTypeForNEON(MVT::v2f64);
375 addQRTypeForNEON(MVT::v16i8);
376 addQRTypeForNEON(MVT::v8i16);
377 addQRTypeForNEON(MVT::v4i32);
378 addQRTypeForNEON(MVT::v2i64);
379 addQRTypeForNEON(MVT::v8f16);
380 addQRTypeForNEON(MVT::v8bf16);
381 }
382
383 if (Subtarget->hasSVEorSME()) {
384 // Add legal SVE predicate types
385 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
386 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
387 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
388 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
389 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
390
391 // Add legal SVE data types
392 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
393 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
394 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
395 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
396
397 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
398 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
399 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
400 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
401 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
402 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
403
404 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
405 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
406 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
407
408 if (Subtarget->useSVEForFixedLengthVectors()) {
411 addRegisterClass(VT, &AArch64::ZPRRegClass);
412
415 addRegisterClass(VT, &AArch64::ZPRRegClass);
416 }
417 }
418
419 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
420 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
421 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
422 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
423
424 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
425 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
426 }
427
428 // Compute derived properties from the register classes
430
431 // Provide all sorts of operation actions
470
474
478
480
481 // Custom lowering hooks are needed for XOR
482 // to fold it into CSINC/CSINV.
485
486 // Virtually no operation on f128 is legal, but LLVM can't expand them when
487 // there's a valid register class, so we need custom operations in most cases.
511 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
512 // aren't handled.
513
514 // Lowering for many of the conversions is actually specified by the non-f128
515 // type. The LowerXXX function will be trivial when f128 isn't involved.
540 if (Subtarget->hasFPARMv8()) {
543 }
546 if (Subtarget->hasFPARMv8()) {
549 }
552
557
558 // Variable arguments.
563
564 // Variable-sized objects.
567
568 // Lowering Funnel Shifts to EXTR
573
575
576 // Constant pool entries
578
579 // BlockAddress
581
582 // AArch64 lacks both left-rotate and popcount instructions.
588 }
589
590 // AArch64 doesn't have i32 MULH{S|U}.
593
594 // AArch64 doesn't have {U|S}MUL_LOHI.
599
600 if (Subtarget->hasCSSC()) {
604
606
610
613
618
623 } else {
627
630
633 }
634
640 }
647
648 // Custom lower Add/Sub/Mul with overflow.
661
670
679 if (Subtarget->hasFullFP16()) {
682 } else {
685 }
686
687 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
695 setOperationAction(Op, MVT::f16, Promote);
696 setOperationAction(Op, MVT::v4f16, Expand);
697 setOperationAction(Op, MVT::v8f16, Expand);
698 setOperationAction(Op, MVT::bf16, Promote);
699 setOperationAction(Op, MVT::v4bf16, Expand);
700 setOperationAction(Op, MVT::v8bf16, Expand);
701 }
702
703 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
704 for (auto Op : {
708 ISD::FADD,
709 ISD::FSUB,
710 ISD::FMUL,
711 ISD::FDIV,
712 ISD::FMA,
742 })
743 setOperationAction(Op, ScalarVT, Promote);
744
745 for (auto Op : {ISD::FNEG, ISD::FABS})
746 setOperationAction(Op, ScalarVT, Legal);
747
748 // Round-to-integer operations need custom lowering for fp16, as Promote
749 // doesn't work because the result type is integer.
753 setOperationAction(Op, ScalarVT, Custom);
754
755 // promote v4f16 to v4f32 when that is known to be safe.
756 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
757 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
758 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
759 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
760 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
761 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
762 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
763 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
764 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
765 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
766 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
767 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
768
778
779 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
801 };
802
803 if (!Subtarget->hasFullFP16()) {
804 LegalizeNarrowFP(MVT::f16);
805 }
806 LegalizeNarrowFP(MVT::bf16);
809
810 // AArch64 has implementations of a lot of rounding-like FP operations.
811 for (auto Op :
822 for (MVT Ty : {MVT::f32, MVT::f64})
824 if (Subtarget->hasFullFP16())
825 setOperationAction(Op, MVT::f16, Legal);
826 }
827
828 // Basic strict FP operations are legal
831 for (MVT Ty : {MVT::f32, MVT::f64})
833 if (Subtarget->hasFullFP16())
834 setOperationAction(Op, MVT::f16, Legal);
835 }
836
837 // Strict conversion to a larger type is legal
838 for (auto VT : {MVT::f32, MVT::f64})
840
842
848
850 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
853 } else {
856 }
859
860 // Generate outline atomics library calls only if LSE was not specified for
861 // the subtarget.
862 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
888#define LCALLNAMES(A, B, N) \
889 setLibcallName(A##N##_RELAX, #B #N "_relax"); \
890 setLibcallName(A##N##_ACQ, #B #N "_acq"); \
891 setLibcallName(A##N##_REL, #B #N "_rel"); \
892 setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
893#define LCALLNAME4(A, B) \
894 LCALLNAMES(A, B, 1) \
895 LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
896#define LCALLNAME5(A, B) \
897 LCALLNAMES(A, B, 1) \
898 LCALLNAMES(A, B, 2) \
899 LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
900 LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
901 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
902 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
903 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
904 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
905 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
906#undef LCALLNAMES
907#undef LCALLNAME4
908#undef LCALLNAME5
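  // (Illustrative expansion, not part of the original source: the LCALLNAME4
  // use above for RTLIB::OUTLINE_ATOMIC_SWP registers, among others,
  //   setLibcallName(RTLIB::OUTLINE_ATOMIC_SWP1_RELAX, "__aarch64_swp1_relax");
  //   setLibcallName(RTLIB::OUTLINE_ATOMIC_SWP1_ACQ, "__aarch64_swp1_acq");
  // and so on for the _REL/_ACQ_REL orderings and the 2-, 4- and 8-byte sizes.)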
909 }
910
911 if (Subtarget->hasLSE128()) {
912 // Custom lowering because i128 is not legal. Must be replaced by 2x64
913 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
917 }
918
919 // 128-bit loads and stores can be done without expanding
922
923 // Aligned 128-bit loads and stores are single-copy atomic according to the
924 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
925 if (Subtarget->hasLSE2()) {
928 }
929
930 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
931 // custom lowering, as there are no un-paired non-temporal stores and
932 // legalization will break up 256 bit inputs.
934 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
935 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
936 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
941
942 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
943 // custom lowering, as there are no un-paired non-temporal loads and
944 // legalization will break up 256 bit inputs.
945 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
946 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
947 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
948 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
949 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
950 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
951 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
952 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
953
954 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
956
957 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
958 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
959 // Issue __sincos_stret if available.
962 } else {
965 }
966
967 if (Subtarget->getTargetTriple().isOSMSVCRT()) {
968 // MSVCRT doesn't have powi; fall back to pow
969 setLibcallName(RTLIB::POWI_F32, nullptr);
970 setLibcallName(RTLIB::POWI_F64, nullptr);
971 }
972
973 // Make floating-point constants legal for the large code model, so they don't
974 // become loads from the constant pool.
975 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
978 }
979
980 // AArch64 does not have floating-point extending loads, i1 sign-extending
981 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
982 for (MVT VT : MVT::fp_valuetypes()) {
983 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
984 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
985 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
986 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
987 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
988 }
989 for (MVT VT : MVT::integer_valuetypes())
991
992 for (MVT WideVT : MVT::fp_valuetypes()) {
993 for (MVT NarrowVT : MVT::fp_valuetypes()) {
994 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
995 setTruncStoreAction(WideVT, NarrowVT, Expand);
996 }
997 }
998 }
999
1000 if (Subtarget->hasFPARMv8()) {
1004 }
1005
1006 // Indexed loads and stores are supported.
1007 for (unsigned im = (unsigned)ISD::PRE_INC;
1009 setIndexedLoadAction(im, MVT::i8, Legal);
1010 setIndexedLoadAction(im, MVT::i16, Legal);
1011 setIndexedLoadAction(im, MVT::i32, Legal);
1012 setIndexedLoadAction(im, MVT::i64, Legal);
1013 setIndexedLoadAction(im, MVT::f64, Legal);
1014 setIndexedLoadAction(im, MVT::f32, Legal);
1015 setIndexedLoadAction(im, MVT::f16, Legal);
1016 setIndexedLoadAction(im, MVT::bf16, Legal);
1017 setIndexedStoreAction(im, MVT::i8, Legal);
1018 setIndexedStoreAction(im, MVT::i16, Legal);
1019 setIndexedStoreAction(im, MVT::i32, Legal);
1020 setIndexedStoreAction(im, MVT::i64, Legal);
1021 setIndexedStoreAction(im, MVT::f64, Legal);
1022 setIndexedStoreAction(im, MVT::f32, Legal);
1023 setIndexedStoreAction(im, MVT::f16, Legal);
1024 setIndexedStoreAction(im, MVT::bf16, Legal);
1025 }
1026
1027 // Trap.
1028 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1031
1032 // We combine OR nodes for bitfield operations.
1034 // Try to create BICs for vector ANDs.
1036
1037 // Vector add and sub nodes may conceal a high-half opportunity.
1038 // Also, try to fold ADD into CSINC/CSINV..
1041
1044
1045 // Try and combine setcc with csel
1047
1049
1056
1058
1060
1062
1066
1068
1070
1072
1074
1078
1080
1081 // In case of strict alignment, avoid an excessive number of byte wide stores.
1084 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1085
1089 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1090
1093
1096 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1097
1099
1101
1102 EnableExtLdPromotion = true;
1103
1104 // Set required alignment.
1106 // Set preferred alignments.
1107
1108 // Don't align loops on Windows. The SEH unwind info generation needs to
1109 // know the exact length of functions before the alignments have been
1110 // expanded.
1111 if (!Subtarget->isTargetWindows())
1115
1116 // Only change the limit for entries in a jump table if specified by
1117 // the subtarget, but not at the command line.
1118 unsigned MaxJT = STI.getMaximumJumpTableSize();
1119 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1121
1123
1125
1127
1128 if (Subtarget->hasNEON()) {
1129 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1130 // silliness like this:
1131 for (auto Op :
1149 setOperationAction(Op, MVT::v1f64, Expand);
1150
1151 for (auto Op :
1156 setOperationAction(Op, MVT::v1i64, Expand);
1157
1158 // AArch64 doesn't have direct vector->f32 conversion instructions for
1159 // elements smaller than i32, so promote the input to i32 first.
1160 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1161 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1162
1163 // Similarly, there is no direct i32 -> f64 vector conversion instruction.
1164 // Or, direct i32 -> f16 vector conversion. Set it to Custom, so the
1165 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1168 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1170
1171 if (Subtarget->hasFullFP16()) {
1174
1183 } else {
1184 // When AArch64 doesn't have full fp16 support, promote the input
1185 // to i32 first.
1186 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1187 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1188 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1189 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1190 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1191 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1192 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1193 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1194 }
1195
1196 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1197 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1204 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1209 }
1210
1211 // Custom handling for some quad-vector types to detect MULL.
1212 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1213 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1214 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1215 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1216 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1217 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1218
1219 // Saturates
1220 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1221 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1226 }
1227
1228 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1229 MVT::v4i32}) {
1236 }
1237
1238 // Vector reductions
1239 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1240 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1241 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1246
1248 }
1249 }
1250 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1251 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1260 }
1265
1267 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1268 // Likewise, narrowing and extending vector loads/stores aren't handled
1269 // directly.
1272
1273 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1276 } else {
1279 }
1282
1285
1286 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1287 setTruncStoreAction(VT, InnerVT, Expand);
1288 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1289 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1290 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1291 }
1292 }
1293
1294 // AArch64 has implementations of a lot of rounding-like FP operations.
1295 for (auto Op :
1300 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1302 if (Subtarget->hasFullFP16())
1303 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1305 }
1306
1307 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1308
1313
1317
1318 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1319 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1320 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1321 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1322 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1323 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1324
1325 // ADDP custom lowering
1326 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1328 // FADDP custom lowering
1329 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1331 }
1332
1333 if (Subtarget->hasSME()) {
1335 }
1336
1337 // FIXME: Move lowering for more nodes here if those are common between
1338 // SVE and SME.
1339 if (Subtarget->hasSVEorSME()) {
1340 for (auto VT :
1341 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1346 }
1347 }
1348
1349 if (Subtarget->hasSVEorSME()) {
1350 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1393
1399
1408
1413
1414 if (!Subtarget->isLittleEndian())
1416
1417 if (Subtarget->hasSVE2orSME())
1418 // For SLI/SRI.
1420 }
1421
1422 // Illegal unpacked integer vector types.
1423 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1426 }
1427
1428 // Legalize unpacked bitcasts to REINTERPRET_CAST.
1429 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
1430 MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
1432
1433 for (auto VT :
1434 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1435 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1437
1438 for (auto VT :
1439 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1447
1451
1452 // There are no legal MVT::nxv16f## based types.
1453 if (VT != MVT::nxv16i1) {
1456 }
1457 }
1458
1459 // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
1460 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1461 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1462 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1467 }
1468
1469 // Firstly, exclude all scalable vector extending loads/truncating stores,
1470 // including both integer and floating-point scalable vectors.
1472 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1473 setTruncStoreAction(VT, InnerVT, Expand);
1474 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1475 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1476 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1477 }
1478 }
1479
1480 // Then, selectively enable those which we directly support.
1481 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1482 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1483 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1484 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1485 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1486 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1487 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1488 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1489 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1490 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1491 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1492 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1493 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1494 }
1495
1496 // SVE supports truncating stores of 64 and 128-bit vectors
1497 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1498 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1499 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1500 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1501 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1502
1503 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1504 MVT::nxv4f32, MVT::nxv2f64}) {
1540 if (Subtarget->isSVEAvailable())
1545
1559
1571
1572 if (!Subtarget->isLittleEndian())
1574 }
1575
1576 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1583
1584 if (!Subtarget->isLittleEndian())
1586 }
1587
1590
1591 // NEON doesn't support integer divides, but SVE does
1592 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1593 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1596 }
1597
1598 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1599 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1600 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1601
1602 if (Subtarget->isSVEAvailable()) {
1603 // NEON doesn't support across-vector reductions, but SVE does.
1604 for (auto VT :
1605 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
1607 }
1608
1609 // NOTE: Currently this has to happen after computeRegisterProperties rather
1610 // than the preferred option of combining it with the addRegisterClass call.
1611 if (Subtarget->useSVEForFixedLengthVectors()) {
1614 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1615 addTypeForFixedLengthSVE(VT);
1616 }
1619 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1620 addTypeForFixedLengthSVE(VT);
1621 }
1622
1623 // 64-bit results can mean a bigger-than-NEON input.
1624 for (auto VT : {MVT::v8i8, MVT::v4i16})
1627
1628 // 128-bit results imply a bigger-than-NEON input.
1629 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1631 for (auto VT : {MVT::v8f16, MVT::v4f32})
1633
1634 // These operations are not supported on NEON but SVE can do them.
1636 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1637 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1638 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1639 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1640 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1641 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1642 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1643 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1644 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1645 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1646 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1647 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1648 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1649 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1650 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1655
1656 // Int operations with no NEON support.
1657 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1658 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1666 }
1667
1668
1669 // Use SVE for vectors with more than 2 elements.
1670 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1672 }
1673
1674 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1675 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1676 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1677 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1678
1680
1681 for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
1683 }
1684
1685 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1686 // Only required for llvm.aarch64.mops.memset.tag
1688 }
1689
1691
1692 if (Subtarget->hasSVE()) {
1697 }
1698
1699 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1700
1701 IsStrictFPEnabled = true;
1703
1704 if (Subtarget->isWindowsArm64EC()) {
1705 // FIXME: are there intrinsics we need to exclude from this?
1706 for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
1707 auto code = static_cast<RTLIB::Libcall>(i);
1708 auto libcallName = getLibcallName(code);
1709 if ((libcallName != nullptr) && (libcallName[0] != '#')) {
1710 setLibcallName(code, Saver.save(Twine("#") + libcallName).data());
1711 }
1712 }
1713 }
1714}
1715
1716void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1717 assert(VT.isVector() && "VT should be a vector type");
1718
1719 if (VT.isFloatingPoint()) {
1721 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1722 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1723 }
1724
1725 // Mark vector float intrinsics as expand.
1726 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1736 }
1737
1738 // But we do support custom-lowering for FCOPYSIGN.
1739 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1740 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
1741 VT == MVT::v8f16) &&
1742 Subtarget->hasFullFP16()))
1744
1757
1761 for (MVT InnerVT : MVT::all_valuetypes())
1762 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1763
1764 // CNT supports only B element sizes, so use UADDLP afterwards to widen.
1765 if (VT != MVT::v8i8 && VT != MVT::v16i8)
1767
1773
1774 for (unsigned Opcode :
1777 setOperationAction(Opcode, VT, Custom);
1778
1779 if (!VT.isFloatingPoint())
1781
1782 // [SU][MIN|MAX] are available for all NEON types apart from i64.
1783 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1784 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1785 setOperationAction(Opcode, VT, Legal);
1786
1787 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1788 // NEON types.
1789 if (VT.isFloatingPoint() &&
1790 VT.getVectorElementType() != MVT::bf16 &&
1791 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1792 for (unsigned Opcode :
1798 setOperationAction(Opcode, VT, Legal);
1799
1800 // Strict fp extend and trunc are legal
1801 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
1803 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
1805
1806 // FIXME: We could potentially make use of the vector comparison instructions
1807 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
1808 // complications:
1809 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1810 // so we would need to expand when the condition code doesn't match the
1811 // kind of comparison.
1812 // * Some kinds of comparison require more than one FCMXY instruction so
1813 // would need to be expanded instead.
1814 // * The lowering of the non-strict versions involves target-specific ISD
1815 // nodes so we would likely need to add strict versions of all of them and
1816 // handle them appropriately.
1819
1820 if (Subtarget->isLittleEndian()) {
1821 for (unsigned im = (unsigned)ISD::PRE_INC;
1825 }
1826 }
1827
1828 if (Subtarget->hasD128()) {
1831 }
1832}
1833
1835 EVT OpVT) const {
1836 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
1837 if (!Subtarget->hasSVE())
1838 return true;
1839
1840 // We can only support legal predicate result types. We can use the SVE
1841 // whilelo instruction for generating fixed-width predicates too.
1842 if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
1843 ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
1844 ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
1845 return true;
1846
1847 // The whilelo instruction only works with i32 or i64 scalar inputs.
1848 if (OpVT != MVT::i32 && OpVT != MVT::i64)
1849 return true;
1850
1851 return false;
1852}
1853
1855 return !Subtarget->hasSVEorSME() || VT != MVT::nxv16i1;
1856}
1857
1858void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
1859 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1860
1861 // By default everything must be expanded.
1862 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1864
1865 if (VT.isFloatingPoint()) {
1875 }
1876
1878 VT == MVT::v1f64 ? Expand : Custom;
1879
1880 // Mark integer truncating stores/extending loads as having custom lowering
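  // (Illustrative note: with VT == MVT::v4i32, for example, the loop below
  // walks InnerVT through v4i8 and v4i16, registering the corresponding
  // truncating stores and extending loads.)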
1881 if (VT.isInteger()) {
1882 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
1883 while (InnerVT != VT) {
1884 setTruncStoreAction(VT, InnerVT, Default);
1885 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
1886 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
1887 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
1888 InnerVT = InnerVT.changeVectorElementType(
1889 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
1890 }
1891 }
1892
1893 // Mark floating-point truncating stores/extending loads as having custom
1894 // lowering
1895 if (VT.isFloatingPoint()) {
1896 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
1897 while (InnerVT != VT) {
1898 setTruncStoreAction(VT, InnerVT, Custom);
1899 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
1900 InnerVT = InnerVT.changeVectorElementType(
1902 }
1903 }
1904
1905 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
1906 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
1907
1908 // Lower fixed length vector operations to scalable equivalents.
1913 setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
1948 setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
1949 setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
1951 setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
1970 setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
1996}
1997
1998void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
1999 addRegisterClass(VT, &AArch64::FPR64RegClass);
2000 addTypeForNEON(VT);
2001}
2002
2003void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
2004 addRegisterClass(VT, &AArch64::FPR128RegClass);
2005 addTypeForNEON(VT);
2006}
2007
2009 LLVMContext &C, EVT VT) const {
2010 if (!VT.isVector())
2011 return MVT::i32;
2012 if (VT.isScalableVector())
2013 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2015}
2016
2017// isIntImmediate - This method tests to see if the node is a constant
2018// operand. If so, Imm will receive the value.
2019static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2020 if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
2021 Imm = C->getZExtValue();
2022 return true;
2023 }
2024 return false;
2025}
2026
2027// isOpcWithIntImmediate - This method tests to see if the node is a specific
2028// opcode and that it has an immediate integer right operand.
2029// If so, Imm will receive the value.
2030static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2031 uint64_t &Imm) {
2032 return N->getOpcode() == Opc &&
2033 isIntImmediate(N->getOperand(1).getNode(), Imm);
2034}
2035
2036static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2037 const APInt &Demanded,
2039 unsigned NewOpc) {
2040 uint64_t OldImm = Imm, NewImm, Enc;
2041 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2042
2043 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2044 // bimm64.
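  // (Illustrative note: a bimm32/bimm64 is an AArch64 logical immediate, i.e. a
  // rotated run of contiguous ones replicated across the register. For example,
  // 0x00ff00ff is already encodable as a bimm32 and is left alone, whereas
  // 0x00345678 is not and may become encodable once the non-demanded bits are
  // adjusted below.)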
2045 if (Imm == 0 || Imm == Mask ||
2047 return false;
2048
2049 unsigned EltSize = Size;
2050 uint64_t DemandedBits = Demanded.getZExtValue();
2051
2052 // Clear bits that are not demanded.
2053 Imm &= DemandedBits;
2054
2055 while (true) {
2056 // The goal here is to set the non-demanded bits in a way that minimizes
2057 // the number of switching between 0 and 1. In order to achieve this goal,
2058 // we set the non-demanded bits to the value of the preceding demanded bits.
2059 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2060 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2061 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2062 // The final result is 0b11000011.
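    // (Illustrative trace of the code below, using an 8-bit element purely for
    // brevity; the real search starts at Size, i.e. 32 or 64 bits. With
    // DemandedBits = 0b01100101 and Imm = 0b01000001:
    //   NonDemandedBits = 0b10011010
    //   InvertedImm     = 0b00100100
    //   RotatedImm      = 0b00001000
    //   Sum             = 0b10100010, Carry = 0
    //   Ones            = 0b10000010
    // so NewImm = (Imm | Ones) & Mask = 0b11000011, matching the example above.)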
2063 uint64_t NonDemandedBits = ~DemandedBits;
2064 uint64_t InvertedImm = ~Imm & DemandedBits;
2065 uint64_t RotatedImm =
2066 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2067 NonDemandedBits;
2068 uint64_t Sum = RotatedImm + NonDemandedBits;
2069 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2070 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2071 NewImm = (Imm | Ones) & Mask;
2072
2073 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2074 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2075 // we halve the element size and continue the search.
2076 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2077 break;
2078
2079 // We cannot shrink the element size any further if it is 2-bits.
2080 if (EltSize == 2)
2081 return false;
2082
2083 EltSize /= 2;
2084 Mask >>= EltSize;
2085 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2086
2087 // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
2088 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2089 return false;
2090
2091 // Merge the upper and lower halves of Imm and DemandedBits.
2092 Imm |= Hi;
2093 DemandedBits |= DemandedBitsHi;
2094 }
2095
2096 ++NumOptimizedImms;
2097
2098 // Replicate the element across the register width.
2099 while (EltSize < Size) {
2100 NewImm |= NewImm << EltSize;
2101 EltSize *= 2;
2102 }
2103
2104 (void)OldImm;
2105 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2106 "demanded bits should never be altered");
2107 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2108
2109 // Create the new constant immediate node.
2110 EVT VT = Op.getValueType();
2111 SDLoc DL(Op);
2112 SDValue New;
2113
2114 // If the new constant immediate is all-zeros or all-ones, let the target
2115 // independent DAG combine optimize this node.
2116 if (NewImm == 0 || NewImm == OrigMask) {
2117 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2118 TLO.DAG.getConstant(NewImm, DL, VT));
2119 // Otherwise, create a machine node so that target independent DAG combine
2120 // doesn't undo this optimization.
2121 } else {
2123 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2124 New = SDValue(
2125 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2126 }
2127
2128 return TLO.CombineTo(Op, New);
2129}
2130
2132 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2133 TargetLoweringOpt &TLO) const {
2134 // Delay this optimization to as late as possible.
2135 if (!TLO.LegalOps)
2136 return false;
2137
2139 return false;
2140
2141 EVT VT = Op.getValueType();
2142 if (VT.isVector())
2143 return false;
2144
2145 unsigned Size = VT.getSizeInBits();
2146 assert((Size == 32 || Size == 64) &&
2147 "i32 or i64 is expected after legalization.");
2148
2149 // Exit early if we demand all bits.
2150 if (DemandedBits.popcount() == Size)
2151 return false;
2152
2153 unsigned NewOpc;
2154 switch (Op.getOpcode()) {
2155 default:
2156 return false;
2157 case ISD::AND:
2158 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2159 break;
2160 case ISD::OR:
2161 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2162 break;
2163 case ISD::XOR:
2164 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2165 break;
2166 }
2167 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2168 if (!C)
2169 return false;
2170 uint64_t Imm = C->getZExtValue();
2171 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2172}
2173
2174/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2175/// Mask are known to be either zero or one and return them in Known.
2177 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2178 const SelectionDAG &DAG, unsigned Depth) const {
2179 switch (Op.getOpcode()) {
2180 default:
2181 break;
2182 case AArch64ISD::DUP: {
2183 SDValue SrcOp = Op.getOperand(0);
2184 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2185 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2186 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2187 "Expected DUP implicit truncation");
2188 Known = Known.trunc(Op.getScalarValueSizeInBits());
2189 }
2190 break;
2191 }
2192 case AArch64ISD::CSEL: {
2193 KnownBits Known2;
2194 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2195 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2196 Known = Known.intersectWith(Known2);
2197 break;
2198 }
2199 case AArch64ISD::BICi: {
2200 // Compute the bit cleared value.
2201 uint64_t Mask =
2202 ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
2203 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2204 Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
2205 break;
2206 }
2207 case AArch64ISD::VLSHR: {
2208 KnownBits Known2;
2209 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2210 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2211 Known = KnownBits::lshr(Known, Known2);
2212 break;
2213 }
2214 case AArch64ISD::VASHR: {
2215 KnownBits Known2;
2216 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2217 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2218 Known = KnownBits::ashr(Known, Known2);
2219 break;
2220 }
2221 case AArch64ISD::VSHL: {
2222 KnownBits Known2;
2223 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2224 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2225 Known = KnownBits::shl(Known, Known2);
2226 break;
2227 }
2228 case AArch64ISD::MOVI: {
2230 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2231 break;
2232 }
2234 case AArch64ISD::ADDlow: {
2235 if (!Subtarget->isTargetILP32())
2236 break;
2237 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2238 Known.Zero = APInt::getHighBitsSet(64, 32);
2239 break;
2240 }
2242 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2243 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2244 break;
2245 }
2247 Intrinsic::ID IntID =
2248 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2249 switch (IntID) {
2250 default: return;
2251 case Intrinsic::aarch64_ldaxr:
2252 case Intrinsic::aarch64_ldxr: {
2253 unsigned BitWidth = Known.getBitWidth();
2254 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2255 unsigned MemBits = VT.getScalarSizeInBits();
2256 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2257 return;
2258 }
2259 }
2260 break;
2261 }
2263 case ISD::INTRINSIC_VOID: {
2264 unsigned IntNo = Op.getConstantOperandVal(0);
2265 switch (IntNo) {
2266 default:
2267 break;
2268 case Intrinsic::aarch64_neon_uaddlv: {
2269 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2270 unsigned BitWidth = Known.getBitWidth();
2271 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
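        // (Explanatory note: a UADDLV of v8i8 sums at most 8 * 255 = 2040, which
        // fits in 11 bits; for v16i8 the maximum is 16 * 255 = 4080, which fits
        // in 12 bits. Bits at or above Bound can therefore be marked known-zero.)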
2272 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2273 assert(BitWidth >= Bound && "Unexpected width!");
2275 Known.Zero |= Mask;
2276 }
2277 break;
2278 }
2279 case Intrinsic::aarch64_neon_umaxv:
2280 case Intrinsic::aarch64_neon_uminv: {
2281 // Figure out the datatype of the vector operand. The UMINV instruction
2282 // will zero extend the result, so we can mark as known zero all the
2283 // bits larger than the element datatype. 32-bit or larger doesn't need
2284 // this as those are legal types and will be handled by isel directly.
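      // (For example, umaxv of an MVT::v8i8 operand produces a value that fits
      // in 8 bits, so with an i32 result bits [31:8] are known zero; for
      // v4i16/v8i16 operands, bits [31:16] are known zero.)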
2285 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2286 unsigned BitWidth = Known.getBitWidth();
2287 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2288 assert(BitWidth >= 8 && "Unexpected width!");
2290 Known.Zero |= Mask;
2291 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2292 assert(BitWidth >= 16 && "Unexpected width!");
2294 Known.Zero |= Mask;
2295 }
2296 break;
2297 } break;
2298 }
2299 }
2300 }
2301}
2302
2304 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2305 unsigned Depth) const {
2306 EVT VT = Op.getValueType();
2307 unsigned VTBits = VT.getScalarSizeInBits();
2308 unsigned Opcode = Op.getOpcode();
2309 switch (Opcode) {
2310 case AArch64ISD::CMEQ:
2311 case AArch64ISD::CMGE:
2312 case AArch64ISD::CMGT:
2313 case AArch64ISD::CMHI:
2314 case AArch64ISD::CMHS:
2315 case AArch64ISD::FCMEQ:
2316 case AArch64ISD::FCMGE:
2317 case AArch64ISD::FCMGT:
2318 case AArch64ISD::CMEQz:
2319 case AArch64ISD::CMGEz:
2320 case AArch64ISD::CMGTz:
2321 case AArch64ISD::CMLEz:
2322 case AArch64ISD::CMLTz:
2323 case AArch64ISD::FCMEQz:
2324 case AArch64ISD::FCMGEz:
2325 case AArch64ISD::FCMGTz:
2326 case AArch64ISD::FCMLEz:
2327 case AArch64ISD::FCMLTz:
2328 // Compares return either 0 or all-ones
2329 return VTBits;
2330 }
2331
2332 return 1;
2333}
2334
2336 EVT) const {
2337 return MVT::i64;
2338}
2339
2341 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2342 unsigned *Fast) const {
2343 if (Subtarget->requiresStrictAlign())
2344 return false;
2345
2346 if (Fast) {
2347 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2348 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2349 // See comments in performSTORECombine() for more details about
2350 // these conditions.
2351
2352 // Code that uses clang vector extensions can mark that it
2353 // wants unaligned accesses to be treated as fast by
2354 // underspecifying alignment to be 1 or 2.
2355 Alignment <= 2 ||
2356
2357 // Disregard v2i64. Memcpy lowering produces those and splitting
2358 // them regresses performance on micro-benchmarks and olden/bh.
2359 VT == MVT::v2i64;
2360 }
2361 return true;
2362}
2363
2364// Same as above but handling LLTs instead.
2366 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2367 unsigned *Fast) const {
2368 if (Subtarget->requiresStrictAlign())
2369 return false;
2370
2371 if (Fast) {
2372 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2373 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2374 Ty.getSizeInBytes() != 16 ||
2375 // See comments in performSTORECombine() for more details about
2376 // these conditions.
2377
2378 // Code that uses clang vector extensions can mark that it
2379 // wants unaligned accesses to be treated as fast by
2380 // underspecifying alignment to be 1 or 2.
2381 Alignment <= 2 ||
2382
2383 // Disregard v2i64. Memcpy lowering produces those and splitting
2384 // them regresses performance on micro-benchmarks and olden/bh.
2385 Ty == LLT::fixed_vector(2, 64);
2386 }
2387 return true;
2388}
2389
2390FastISel *
2392 const TargetLibraryInfo *libInfo) const {
2393 return AArch64::createFastISel(funcInfo, libInfo);
2394}
2395
2396const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2397#define MAKE_CASE(V) \
2398 case V: \
2399 return #V;
2400 switch ((AArch64ISD::NodeType)Opcode) {
2402 break;
2719 }
2720#undef MAKE_CASE
2721 return nullptr;
2722}
2723
2726 MachineBasicBlock *MBB) const {
2727 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2728 // phi node:
2729
2730 // OrigBB:
2731 // [... previous instrs leading to comparison ...]
2732 // b.ne TrueBB
2733 // b EndBB
2734 // TrueBB:
2735 // ; Fallthrough
2736 // EndBB:
2737 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2738
2739 MachineFunction *MF = MBB->getParent();
2740 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2741 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2742 DebugLoc DL = MI.getDebugLoc();
2744
2745 Register DestReg = MI.getOperand(0).getReg();
2746 Register IfTrueReg = MI.getOperand(1).getReg();
2747 Register IfFalseReg = MI.getOperand(2).getReg();
2748 unsigned CondCode = MI.getOperand(3).getImm();
2749 bool NZCVKilled = MI.getOperand(4).isKill();
2750
2751 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2752 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2753 MF->insert(It, TrueBB);
2754 MF->insert(It, EndBB);
2755
2756 // Transfer rest of current basic-block to EndBB
2757 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2758 MBB->end());
2760
2761 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2762 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2763 MBB->addSuccessor(TrueBB);
2764 MBB->addSuccessor(EndBB);
2765
2766 // TrueBB falls through to the end.
2767 TrueBB->addSuccessor(EndBB);
2768
2769 if (!NZCVKilled) {
2770 TrueBB->addLiveIn(AArch64::NZCV);
2771 EndBB->addLiveIn(AArch64::NZCV);
2772 }
2773
2774 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2775 .addReg(IfTrueReg)
2776 .addMBB(TrueBB)
2777 .addReg(IfFalseReg)
2778 .addMBB(MBB);
2779
2780 MI.eraseFromParent();
2781 return EndBB;
2782}
2783
2785 MachineInstr &MI, MachineBasicBlock *BB) const {
2787 BB->getParent()->getFunction().getPersonalityFn())) &&
2788 "SEH does not use catchret!");
2789 return BB;
2790}
2791
2794 MachineBasicBlock *MBB) const {
2795 MachineFunction &MF = *MBB->getParent();
2796 MachineBasicBlock::iterator MBBI = MI.getIterator();
2798 const AArch64InstrInfo &TII =
2799 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
2800 Register TargetReg = MI.getOperand(0).getReg();
2802 TII.probedStackAlloc(MBBI, TargetReg, false);
2803
2804 MI.eraseFromParent();
2805 return NextInst->getParent();
2806}
2807
2809AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2811 MachineBasicBlock *BB) const {
2812 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2813 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2814
2815 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2816 MIB.add(MI.getOperand(1)); // slice index register
2817 MIB.add(MI.getOperand(2)); // slice index offset
2818 MIB.add(MI.getOperand(3)); // pg
2819 MIB.add(MI.getOperand(4)); // base
2820 MIB.add(MI.getOperand(5)); // offset
2821
2822 MI.eraseFromParent(); // The pseudo is gone now.
2823 return BB;
2824}
2825
2828 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2830 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
2831
2832 MIB.addReg(AArch64::ZA, RegState::Define);
2833 MIB.add(MI.getOperand(0)); // Vector select register
2834 MIB.add(MI.getOperand(1)); // Vector select offset
2835 MIB.add(MI.getOperand(2)); // Base
2836 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
2837
2838 MI.eraseFromParent(); // The pseudo is gone now.
2839 return BB;
2840}
2841
2844 unsigned Opcode,
2845 bool Op0IsDef) const {
2846 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2848
2849 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
2850 .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
2851 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
2852 MIB.add(MI.getOperand(I));
2853
2854 MI.eraseFromParent(); // The pseudo is gone now.
2855 return BB;
2856}
2857
2859AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
2861 MachineBasicBlock *BB, bool HasTile) const {
2862 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2863 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2864 unsigned StartIdx = 0;
2865
2866 if (HasTile) {
2867 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2868 MIB.addReg(BaseReg + MI.getOperand(0).getImm());
2869 StartIdx = 1;
2870 } else
2871 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
2872
2873 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
2874 MIB.add(MI.getOperand(I));
2875
2876 MI.eraseFromParent(); // The pseudo is gone now.
2877 return BB;
2878}
2879
2882 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2884 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
2885 MIB.add(MI.getOperand(0)); // Mask
2886
2887 unsigned Mask = MI.getOperand(0).getImm();
2888 for (unsigned I = 0; I < 8; I++) {
2889 if (Mask & (1 << I))
2890 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
2891 }
2892
2893 MI.eraseFromParent(); // The pseudo is gone now.
2894 return BB;
2895}
2896
2897MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
2898 MachineInstr &MI, MachineBasicBlock *BB) const {
2899
2900 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
2901 if (SMEOrigInstr != -1) {
2902 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2903 uint64_t SMEMatrixType =
2904 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
2905 switch (SMEMatrixType) {
2906 case (AArch64::SMEMatrixArray):
2907 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB, /*HasTile*/ false);
2908 case (AArch64::SMEMatrixTileB):
2909 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB, /*HasTile*/ true);
2910 case (AArch64::SMEMatrixTileH):
2911 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB, /*HasTile*/ true);
2912 case (AArch64::SMEMatrixTileS):
2913 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB, /*HasTile*/ true);
2914 case (AArch64::SMEMatrixTileD):
2915 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB, /*HasTile*/ true);
2916 case (AArch64::SMEMatrixTileQ):
2917 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB, /*HasTile*/ true);
2918 }
2919 }
2920
2921 switch (MI.getOpcode()) {
2922 default:
2923#ifndef NDEBUG
2924 MI.dump();
2925#endif
2926 llvm_unreachable("Unexpected instruction for custom inserter!");
2927
2928 case AArch64::F128CSEL:
2929 return EmitF128CSEL(MI, BB);
2930 case TargetOpcode::STATEPOINT:
2931 // STATEPOINT is a pseudo instruction which has no implicit defs/uses
2932 // while the bl call instruction (to which the statepoint is lowered in the end)
2933 // has an implicit def. This def is early-clobber as it will be set at
2934 // the moment of the call and before any use is read.
2935 // Add this implicit dead def here as a workaround.
2936 MI.addOperand(*MI.getMF(),
2937 MachineOperand::CreateReg(
2938 AArch64::LR, /*isDef*/ true,
2939 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
2940 /*isUndef*/ false, /*isEarlyClobber*/ true));
2941 [[fallthrough]];
2942 case TargetOpcode::STACKMAP:
2943 case TargetOpcode::PATCHPOINT:
2944 return emitPatchPoint(MI, BB);
2945
2946 case TargetOpcode::PATCHABLE_EVENT_CALL:
2947 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
2948 return BB;
2949
2950 case AArch64::CATCHRET:
2951 return EmitLoweredCatchRet(MI, BB);
2952
2953 case AArch64::PROBED_STACKALLOC_DYN:
2954 return EmitDynamicProbedAlloc(MI, BB);
2955
2956 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
2957 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
2958 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
2959 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
2960 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
2961 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
2962 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
2963 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
2964 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
2965 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
2966 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
2967 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
2968 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
2969 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
2970 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
2971 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
2972 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
2973 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
2974 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
2975 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
2976 case AArch64::LDR_ZA_PSEUDO:
2977 return EmitFill(MI, BB);
2978 case AArch64::LDR_TX_PSEUDO:
2979 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
2980 case AArch64::STR_TX_PSEUDO:
2981 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
2982 case AArch64::ZERO_M_PSEUDO:
2983 return EmitZero(MI, BB);
2984 case AArch64::ZERO_T_PSEUDO:
2985 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
2986 }
2987}
2988
2989//===----------------------------------------------------------------------===//
2990// AArch64 Lowering private implementation.
2991//===----------------------------------------------------------------------===//
2992
2993//===----------------------------------------------------------------------===//
2994// Lowering Code
2995//===----------------------------------------------------------------------===//
2996
2997// Forward declarations of SVE fixed length lowering helpers
3002 SelectionDAG &DAG);
3005 EVT VT);
3006
3007/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3008static bool isZerosVector(const SDNode *N) {
3009 // Look through a bit convert.
3010 while (N->getOpcode() == ISD::BITCAST)
3011 N = N->getOperand(0).getNode();
3012
3013 if (ISD::isConstantSplatVectorAllZeros(N))
3014 return true;
3015
3016 if (N->getOpcode() != AArch64ISD::DUP)
3017 return false;
3018
3019 auto Opnd0 = N->getOperand(0);
3020 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3021}
3022
3023/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3024/// CC
3025static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
3026 switch (CC) {
3027 default:
3028 llvm_unreachable("Unknown condition code!");
3029 case ISD::SETNE:
3030 return AArch64CC::NE;
3031 case ISD::SETEQ:
3032 return AArch64CC::EQ;
3033 case ISD::SETGT:
3034 return AArch64CC::GT;
3035 case ISD::SETGE:
3036 return AArch64CC::GE;
3037 case ISD::SETLT:
3038 return AArch64CC::LT;
3039 case ISD::SETLE:
3040 return AArch64CC::LE;
3041 case ISD::SETUGT:
3042 return AArch64CC::HI;
3043 case ISD::SETUGE:
3044 return AArch64CC::HS;
3045 case ISD::SETULT:
3046 return AArch64CC::LO;
3047 case ISD::SETULE:
3048 return AArch64CC::LS;
3049 }
3050}
3051
3052/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3053static void changeFPCCToAArch64CC(ISD::CondCode CC,
3054 AArch64CC::CondCode &CondCode,
3055 AArch64CC::CondCode &CondCode2) {
3056 CondCode2 = AArch64CC::AL;
3057 switch (CC) {
3058 default:
3059 llvm_unreachable("Unknown FP condition!");
3060 case ISD::SETEQ:
3061 case ISD::SETOEQ:
3062 CondCode = AArch64CC::EQ;
3063 break;
3064 case ISD::SETGT:
3065 case ISD::SETOGT:
3066 CondCode = AArch64CC::GT;
3067 break;
3068 case ISD::SETGE:
3069 case ISD::SETOGE:
3070 CondCode = AArch64CC::GE;
3071 break;
3072 case ISD::SETOLT:
3073 CondCode = AArch64CC::MI;
3074 break;
3075 case ISD::SETOLE:
3076 CondCode = AArch64CC::LS;
3077 break;
3078 case ISD::SETONE:
3079 CondCode = AArch64CC::MI;
3080 CondCode2 = AArch64CC::GT;
3081 break;
3082 case ISD::SETO:
3083 CondCode = AArch64CC::VC;
3084 break;
3085 case ISD::SETUO:
3086 CondCode = AArch64CC::VS;
3087 break;
3088 case ISD::SETUEQ:
3089 CondCode = AArch64CC::EQ;
3090 CondCode2 = AArch64CC::VS;
3091 break;
3092 case ISD::SETUGT:
3093 CondCode = AArch64CC::HI;
3094 break;
3095 case ISD::SETUGE:
3096 CondCode = AArch64CC::PL;
3097 break;
3098 case ISD::SETLT:
3099 case ISD::SETULT:
3100 CondCode = AArch64CC::LT;
3101 break;
3102 case ISD::SETLE:
3103 case ISD::SETULE:
3104 CondCode = AArch64CC::LE;
3105 break;
3106 case ISD::SETNE:
3107 case ISD::SETUNE:
3108 CondCode = AArch64CC::NE;
3109 break;
3110 }
3111}
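// A minimal standalone sketch (hypothetical helper names, plain IEEE doubles
// rather than DAG nodes) of why SETONE and SETUEQ above need a second
// condition code: after an FCMP, "ordered and not equal" is exactly
// "less || greater" (MI || GT), and "unordered or equal" is exactly
// "equal || unordered" (EQ || VS).
#include <cassert>
#include <cmath>

static void checkTwoCondCodeSketch() {
  const double A = 1.0, B = 2.0, Q = std::nan("");
  auto One = [](double X, double Y) { return X < Y || X > Y; }; // MI || GT
  auto Ueq = [](double X, double Y) {
    return X == Y || std::isnan(X) || std::isnan(Y);            // EQ || VS
  };
  assert(One(A, B) && !One(A, A) && !One(A, Q));
  assert(!Ueq(A, B) && Ueq(A, A) && Ueq(A, Q));
}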
3112
3113/// Convert a DAG fp condition code to an AArch64 CC.
3114/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3115/// should be AND'ed instead of OR'ed.
3116static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
3117 AArch64CC::CondCode &CondCode,
3118 AArch64CC::CondCode &CondCode2) {
3119 CondCode2 = AArch64CC::AL;
3120 switch (CC) {
3121 default:
3122 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3123 assert(CondCode2 == AArch64CC::AL);
3124 break;
3125 case ISD::SETONE:
3126 // (a one b)
3127 // == ((a olt b) || (a ogt b))
3128 // == ((a ord b) && (a une b))
3129 CondCode = AArch64CC::VC;
3130 CondCode2 = AArch64CC::NE;
3131 break;
3132 case ISD::SETUEQ:
3133 // (a ueq b)
3134 // == ((a uno b) || (a oeq b))
3135 // == ((a ule b) && (a uge b))
3136 CondCode = AArch64CC::PL;
3137 CondCode2 = AArch64CC::LE;
3138 break;
3139 }
3140}
3141
3142/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3143/// CC usable with the vector instructions. Fewer operations are available
3144/// without a real NZCV register, so we have to use less efficient combinations
3145/// to get the same effect.
3146static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
3147 AArch64CC::CondCode &CondCode,
3148 AArch64CC::CondCode &CondCode2,
3149 bool &Invert) {
3150 Invert = false;
3151 switch (CC) {
3152 default:
3153 // Mostly the scalar mappings work fine.
3154 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3155 break;
3156 case ISD::SETUO:
3157 Invert = true;
3158 [[fallthrough]];
3159 case ISD::SETO:
3160 CondCode = AArch64CC::MI;
3161 CondCode2 = AArch64CC::GE;
3162 break;
3163 case ISD::SETUEQ:
3164 case ISD::SETULT:
3165 case ISD::SETULE:
3166 case ISD::SETUGT:
3167 case ISD::SETUGE:
3168 // All of the compare-mask comparisons are ordered, but we can switch
3169 // between the two by a double inversion. E.g. ULE == !OGT.
3170 Invert = true;
3171 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3172 CondCode, CondCode2);
3173 break;
3174 }
3175}
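// A minimal standalone sketch (plain doubles, hypothetical helper name) of
// the double-inversion trick used above for the compare-mask lowering: each
// unordered predicate is the complement of an ordered one (e.g. ULE == !OGT),
// so the ordered compare is emitted and the resulting mask inverted.
#include <cassert>
#include <cmath>

static void checkUnorderedViaInvertSketch() {
  const double Vals[] = {1.0, 2.0, std::nan("")};
  for (double A : Vals)
    for (double B : Vals) {
      bool ULE = !(A > B); // emit the ordered OGT compare, then invert
      bool Ref = A <= B || std::isnan(A) || std::isnan(B);
      assert(ULE == Ref);
    }
}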
3176
3177static bool isLegalArithImmed(uint64_t C) {
3178 // Matches AArch64DAGToDAGISel::SelectArithImmed().
3179 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3180 LLVM_DEBUG(dbgs() << "Is imm " << C
3181 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3182 return IsLegal;
3183}
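// A minimal standalone sketch of the immediate test above, assuming only the
// AArch64 rule that ADD/SUB (immediate) take a 12-bit unsigned value that may
// optionally be shifted left by 12; the helper name is illustrative.
#include <cassert>
#include <cstdint>

static bool isAddSubImmSketch(uint64_t C) {
  bool Unshifted = (C >> 12) == 0;                      // imm12, LSL #0
  bool Shifted = (C & 0xFFFULL) == 0 && (C >> 24) == 0; // imm12, LSL #12
  return Unshifted || Shifted;
}

static void checkAddSubImmSketch() {
  assert(isAddSubImmSketch(0xFFF));      // largest unshifted immediate
  assert(isAddSubImmSketch(0xFFF000));   // largest shifted immediate
  assert(!isAddSubImmSketch(0x1001));    // 13 bits, not a multiple of 4096
  assert(!isAddSubImmSketch(0x1000001)); // too wide even for the shifted form
}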
3184
3185 // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
3186 // the grounds that "op1 - (-op2) == op1 + op2"? Not always: the C and V flags
3187// can be set differently by this operation. It comes down to whether
3188// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3189// everything is fine. If not then the optimization is wrong. Thus general
3190// comparisons are only valid if op2 != 0.
3191//
3192// So, finally, the only LLVM-native comparisons that don't mention C and V
3193// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
3194// the absence of information about op2.
3195static bool isCMN(SDValue Op, ISD::CondCode CC) {
3196 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3197 (CC == ISD::SETEQ || CC == ISD::SETNE);
3198}
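// A minimal standalone model (hypothetical helper names, not LLVM code) of
// the flag caveat described above: SUBS(a, b) and ADDS(a, -b) produce the
// same result bits, so N and Z (and hence EQ/NE) always agree, but the C
// flag can differ, e.g. when b == 0, which is why the CMN rewrite is unsafe
// for conditions that read C or V.
#include <cassert>
#include <cstdint>

struct NZCVSketch { bool N, Z, C, V; };

static NZCVSketch subsFlags(uint32_t A, uint32_t B) { // CMP a, b
  uint32_t R = A - B;
  return {(int32_t)R < 0, R == 0, A >= B, (((A ^ B) & (A ^ R)) >> 31) != 0};
}

static NZCVSketch addsFlags(uint32_t A, uint32_t B) { // CMN a, b == ADDS a, b
  uint32_t R = A + B;
  return {(int32_t)R < 0, R == 0, R < A, ((~(A ^ B) & (A ^ R)) >> 31) != 0};
}

static void checkCmnFlagSketch() {
  const uint32_t A = 5, B = 0;
  NZCVSketch Cmp = subsFlags(A, B);
  NZCVSketch Cmn = addsFlags(A, (uint32_t)-(int32_t)B);
  assert(Cmp.Z == Cmn.Z && Cmp.N == Cmn.N); // EQ/NE are safe
  assert(Cmp.C && !Cmn.C);                  // but C differs when B == 0
}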
3199
3200static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
3201 SelectionDAG &DAG, SDValue Chain,
3202 bool IsSignaling) {
3203 EVT VT = LHS.getValueType();
3204 assert(VT != MVT::f128);
3205
3206 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3207
3208 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3209 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3210 {Chain, LHS});
3211 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3212 {LHS.getValue(1), RHS});
3213 Chain = RHS.getValue(1);
3214 VT = MVT::f32;
3215 }
3216 unsigned Opcode =
3217 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3218 return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
3219}
3220
3221static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3222 const SDLoc &dl, SelectionDAG &DAG) {
3223 EVT VT = LHS.getValueType();
3224 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3225
3226 if (VT.isFloatingPoint()) {
3227 assert(VT != MVT::f128);
3228 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3229 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
3230 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
3231 VT = MVT::f32;
3232 }
3233 return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
3234 }
3235
3236 // The CMP instruction is just an alias for SUBS, and representing it as
3237 // SUBS means that it's possible to get CSE with subtract operations.
3238 // A later phase can perform the optimization of setting the destination
3239 // register to WZR/XZR if it ends up being unused.
3240 unsigned Opcode = AArch64ISD::SUBS;
3241
3242 if (isCMN(RHS, CC)) {
3243 // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
3244 Opcode = AArch64ISD::ADDS;
3245 RHS = RHS.getOperand(1);
3246 } else if (isCMN(LHS, CC)) {
3247 // As we are looking for EQ/NE compares, the operands can be commuted; can
3248 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
3249 Opcode = AArch64ISD::ADDS;
3250 LHS = LHS.getOperand(1);
3251 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3252 if (LHS.getOpcode() == ISD::AND) {
3253 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3254 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3255 // of the signed comparisons.
3256 const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
3257 DAG.getVTList(VT, MVT_CC),
3258 LHS.getOperand(0),
3259 LHS.getOperand(1));
3260 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3261 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3262 return ANDSNode.getValue(1);
3263 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3264 // Use result of ANDS
3265 return LHS.getValue(1);
3266 }
3267 }
3268
3269 return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
3270 .getValue(1);
3271}
3272
3273/// \defgroup AArch64CCMP CMP;CCMP matching
3274///
3275/// These functions deal with the formation of CMP;CCMP;... sequences.
3276/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3277/// a comparison. They set the NZCV flags to a predefined value if their
3278/// predicate is false. This allows us to express arbitrary conjunctions, for
3279/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3280/// expressed as:
3281/// cmp A
3282/// ccmp B, inv(CB), CA
3283/// check for CB flags
3284///
3285/// This naturally lets us implement chains of AND operations with SETCC
3286/// operands. And we can even implement some other situations by transforming
3287/// them:
3288/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3289/// negating the flags used in a CCMP/FCCMP operations.
3290/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3291/// by negating the flags we test for afterwards. i.e.
3292/// NEG (CMP CCMP CCCMP ...) can be implemented.
3293/// - Note that we can only ever negate all previously processed results.
3294/// What we can not implement by flipping the flags to test is a negation
3295/// of two sub-trees (because the negation affects all sub-trees emitted so
3296/// far, so the 2nd sub-tree we emit would also affect the first).
3297/// With those tools we can implement some OR operations:
3298/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3299/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3300/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3301/// elimination rules from earlier to implement the whole thing as a
3302/// CCMP/FCCMP chain.
3303///
3304/// As complete example:
3305/// or (or (setCA (cmp A)) (setCB (cmp B)))
3306/// (and (setCC (cmp C)) (setCD (cmp D)))"
3307/// can be reassociated to:
3308/// or (and (setCC (cmp C)) setCD (cmp D))
3309/// (or (setCA (cmp A)) (setCB (cmp B)))
3310/// can be transformed to:
3311/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3312/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
3313/// which can be implemented as:
3314/// cmp C
3315/// ccmp D, inv(CD), CC
3316/// ccmp A, CA, inv(CD)
3317/// ccmp B, CB, inv(CA)
3318/// check for CB flags
3319///
3320/// A counterexample is "or (and A B) (and C D)" which translates to
3321/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
3322/// can only implement 1 of the inner (not) operations, but not both!
3323/// @{
3324
3325/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3326static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
3327 ISD::CondCode CC, SDValue CCOp,
3328 AArch64CC::CondCode Predicate,
3329 AArch64CC::CondCode OutCC,
3330 const SDLoc &DL, SelectionDAG &DAG) {
3331 unsigned Opcode = 0;
3332 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3333
3334 if (LHS.getValueType().isFloatingPoint()) {
3335 assert(LHS.getValueType() != MVT::f128);
3336 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3337 LHS.getValueType() == MVT::bf16) {
3338 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3339 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3340 }
3341 Opcode = AArch64ISD::FCCMP;
3342 } else if (RHS.getOpcode() == ISD::SUB) {
3343 SDValue SubOp0 = RHS.getOperand(0);
3344 if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
3345 // See emitComparison() on why we can only do this for SETEQ and SETNE.
3346 Opcode = AArch64ISD::CCMN;
3347 RHS = RHS.getOperand(1);
3348 }
3349 }
3350 if (Opcode == 0)
3351 Opcode = AArch64ISD::CCMP;
3352
3353 SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
3354 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
3355 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3356 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3357 return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
3358}
3359
3360/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3361/// expressed as a conjunction. See \ref AArch64CCMP.
3362/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3363/// changing the conditions on the SETCC tests.
3364/// (this means we can call emitConjunctionRec() with
3365/// Negate==true on this sub-tree)
3366/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3367/// cannot do the negation naturally. We are required to
3368/// emit the subtree first in this case.
3369/// \param WillNegate Is true if we are called when the result of this
3370/// subexpression must be negated. This happens when the
3371/// outer expression is an OR. We can use this fact to know
3372/// that we have a double negation (or (or ...) ...) that
3373/// can be implemented for free.
3374static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3375 bool &MustBeFirst, bool WillNegate,
3376 unsigned Depth = 0) {
3377 if (!Val.hasOneUse())
3378 return false;
3379 unsigned Opcode = Val->getOpcode();
3380 if (Opcode == ISD::SETCC) {
3381 if (Val->getOperand(0).getValueType() == MVT::f128)
3382 return false;
3383 CanNegate = true;
3384 MustBeFirst = false;
3385 return true;
3386 }
3387 // Protect against exponential runtime and stack overflow.
3388 if (Depth > 6)
3389 return false;
3390 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3391 bool IsOR = Opcode == ISD::OR;
3392 SDValue O0 = Val->getOperand(0);
3393 SDValue O1 = Val->getOperand(1);
3394 bool CanNegateL;
3395 bool MustBeFirstL;
3396 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3397 return false;
3398 bool CanNegateR;
3399 bool MustBeFirstR;
3400 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3401 return false;
3402
3403 if (MustBeFirstL && MustBeFirstR)
3404 return false;
3405
3406 if (IsOR) {
3407 // For an OR expression we need to be able to naturally negate at least
3408 // one side or we cannot do the transformation at all.
3409 if (!CanNegateL && !CanNegateR)
3410 return false;
3411 // If the result of the OR will be negated and we can naturally negate
3412 // the leaves, then this sub-tree as a whole negates naturally.
3413 CanNegate = WillNegate && CanNegateL && CanNegateR;
3414 // If we cannot naturally negate the whole sub-tree, then this must be
3415 // emitted first.
3416 MustBeFirst = !CanNegate;
3417 } else {
3418 assert(Opcode == ISD::AND && "Must be OR or AND");
3419 // We cannot naturally negate an AND operation.
3420 CanNegate = false;
3421 MustBeFirst = MustBeFirstL || MustBeFirstR;
3422 }
3423 return true;
3424 }
3425 return false;
3426}
3427
3428/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
3429/// of CCMP/FCCMP ops. See @ref AArch64CCMP.
3430/// Tries to transform the given i1 producing node @p Val to a series of compare
3431/// and conditional compare operations. @returns an NZCV flags producing node
3432/// and sets @p OutCC to the flags that should be tested, or returns SDValue() if
3433/// the transformation was not possible.
3434/// \p Negate is true if we want this sub-tree to be negated just by changing
3435/// SETCC conditions.
3436static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
3437 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3438 AArch64CC::CondCode Predicate) {
3439 // We're at a tree leaf, produce a conditional comparison operation.
3440 unsigned Opcode = Val->getOpcode();
3441 if (Opcode == ISD::SETCC) {
3442 SDValue LHS = Val->getOperand(0);
3443 SDValue RHS = Val->getOperand(1);
3444 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3445 bool isInteger = LHS.getValueType().isInteger();
3446 if (Negate)
3447 CC = getSetCCInverse(CC, LHS.getValueType());
3448 SDLoc DL(Val);
3449 // Determine OutCC and handle FP special case.
3450 if (isInteger) {
3451 OutCC = changeIntCCToAArch64CC(CC);
3452 } else {
3453 assert(LHS.getValueType().isFloatingPoint());
3454 AArch64CC::CondCode ExtraCC;
3455 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3456 // Some floating point conditions can't be tested with a single condition
3457 // code. Construct an additional comparison in this case.
3458 if (ExtraCC != AArch64CC::AL) {
3459 SDValue ExtraCmp;
3460 if (!CCOp.getNode())
3461 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3462 else
3463 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3464 ExtraCC, DL, DAG);
3465 CCOp = ExtraCmp;
3466 Predicate = ExtraCC;
3467 }
3468 }
3469
3470 // Produce a normal comparison if we are first in the chain
3471 if (!CCOp)
3472 return emitComparison(LHS, RHS, CC, DL, DAG);
3473 // Otherwise produce a ccmp.
3474 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3475 DAG);
3476 }
3477 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3478
3479 bool IsOR = Opcode == ISD::OR;
3480
3481 SDValue LHS = Val->getOperand(0);
3482 bool CanNegateL;
3483 bool MustBeFirstL;
3484 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3485 assert(ValidL && "Valid conjunction/disjunction tree");
3486 (void)ValidL;
3487
3488 SDValue RHS = Val->getOperand(1);
3489 bool CanNegateR;
3490 bool MustBeFirstR;
3491 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
3492 assert(ValidR && "Valid conjunction/disjunction tree");
3493 (void)ValidR;
3494
3495 // Swap sub-tree that must come first to the right side.
3496 if (MustBeFirstL) {
3497 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3498 std::swap(LHS, RHS);
3499 std::swap(CanNegateL, CanNegateR);
3500 std::swap(MustBeFirstL, MustBeFirstR);
3501 }
3502
3503 bool NegateR;
3504 bool NegateAfterR;
3505 bool NegateL;
3506 bool NegateAfterAll;
3507 if (Opcode == ISD::OR) {
3508 // Swap the sub-tree that we can negate naturally to the left.
3509 if (!CanNegateL) {
3510 assert(CanNegateR && "at least one side must be negatable");
3511 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3512 assert(!Negate);
3513 std::swap(LHS, RHS);
3514 NegateR = false;
3515 NegateAfterR = true;
3516 } else {
3517 // Negate the left sub-tree if possible, otherwise negate the result.
3518 NegateR = CanNegateR;
3519 NegateAfterR = !CanNegateR;
3520 }
3521 NegateL = true;
3522 NegateAfterAll = !Negate;
3523 } else {
3524 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3525 assert(!Negate && "Valid conjunction/disjunction tree");
3526
3527 NegateL = false;
3528 NegateR = false;
3529 NegateAfterR = false;
3530 NegateAfterAll = false;
3531 }
3532
3533 // Emit sub-trees.
3534 AArch64CC::CondCode RHSCC;
3535 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
3536 if (NegateAfterR)
3537 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
3538 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
3539 if (NegateAfterAll)
3540 OutCC = AArch64CC::getInvertedCondCode(OutCC);
3541 return CmpL;
3542}
3543
3544/// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
3545/// In some cases this is even possible with OR operations in the expression.
3546/// See \ref AArch64CCMP.
3547/// \see emitConjunctionRec().
3548static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
3549 AArch64CC::CondCode &OutCC) {
3550 bool DummyCanNegate;
3551 bool DummyMustBeFirst;
3552 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
3553 return SDValue();
3554
3555 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
3556}
3557
3558/// @}
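// A small standalone check (plain bools, hypothetical helper name) of the
// boolean identity the conjunction matcher above leans on: an OR of two
// tests equals the negation of the AND of the negated tests, which is what
// lets a disjunction be folded into a single CMP/CCMP chain whose final
// condition is then inverted.
#include <cassert>

static void checkConjunctionIdentitySketch() {
  for (int A = 0; A < 2; ++A)
    for (int B = 0; B < 2; ++B) {
      const bool SetA = A != 0, SetB = B != 0;
      // (or A B) == not (and (not A) (not B))
      assert((SetA || SetB) == !(!SetA && !SetB));
    }
}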
3559
3560/// Returns how profitable it is to fold a comparison's operand's shift and/or
3561/// extension operations.
3562static unsigned getCmpOperandFoldingProfit(SDValue Op) {
3563 auto isSupportedExtend = [&](SDValue V) {
3564 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
3565 return true;
3566
3567 if (V.getOpcode() == ISD::AND)
3568 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
3569 uint64_t Mask = MaskCst->getZExtValue();
3570 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
3571 }
3572
3573 return false;
3574 };
3575
3576 if (!Op.hasOneUse())
3577 return 0;
3578
3579 if (isSupportedExtend(Op))
3580 return 1;
3581
3582 unsigned Opc = Op.getOpcode();
3583 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
3584 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3585 uint64_t Shift = ShiftCst->getZExtValue();
3586 if (isSupportedExtend(Op.getOperand(0)))
3587 return (Shift <= 4) ? 2 : 1;
3588 EVT VT = Op.getValueType();
3589 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
3590 return 1;
3591 }
3592
3593 return 0;
3594}
3595
3596static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3597 SDValue &AArch64cc, SelectionDAG &DAG,
3598 const SDLoc &dl) {
3599 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
3600 EVT VT = RHS.getValueType();
3601 uint64_t C = RHSC->getZExtValue();
3602 if (!isLegalArithImmed(C)) {
3603 // Constant does not fit, try adjusting it by one?
3604 switch (CC) {
3605 default:
3606 break;
3607 case ISD::SETLT:
3608 case ISD::SETGE:
3609 if ((VT == MVT::i32 && C != 0x80000000 &&
3610 isLegalArithImmed((uint32_t)(C - 1))) ||
3611 (VT == MVT::i64 && C != 0x80000000ULL &&
3612 isLegalArithImmed(C - 1ULL))) {
3613 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
3614 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3615 RHS = DAG.getConstant(C, dl, VT);
3616 }
3617 break;
3618 case ISD::SETULT:
3619 case ISD::SETUGE:
3620 if ((VT == MVT::i32 && C != 0 &&
3621 isLegalArithImmed((uint32_t)(C - 1))) ||
3622 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
3623 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
3624 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3625 RHS = DAG.getConstant(C, dl, VT);
3626 }
3627 break;
3628 case ISD::SETLE:
3629 case ISD::SETGT:
3630 if ((VT == MVT::i32 && C != INT32_MAX &&
3631 isLegalArithImmed((uint32_t)(C + 1))) ||
3632 (VT == MVT::i64 && C != INT64_MAX &&
3633 isLegalArithImmed(C + 1ULL))) {
3634 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
3635 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3636 RHS = DAG.getConstant(C, dl, VT);
3637 }
3638 break;
3639 case ISD::SETULE:
3640 case ISD::SETUGT:
3641 if ((VT == MVT::i32 && C != UINT32_MAX &&
3642 isLegalArithImmed((uint32_t)(C + 1))) ||
3643 (VT == MVT::i64 && C != UINT64_MAX &&
3644 isLegalArithImmed(C + 1ULL))) {
3645 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
3646 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3647 RHS = DAG.getConstant(C, dl, VT);
3648 }
3649 break;
3650 }
3651 }
3652 }
3653
3654 // Comparisons are canonicalized so that the RHS operand is simpler than the
3655 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
3656 // can fold some shift+extend operations on the RHS operand, so swap the
3657 // operands if that can be done.
3658 //
3659 // For example:
3660 // lsl w13, w11, #1
3661 // cmp w13, w12
3662 // can be turned into:
3663 // cmp w12, w11, lsl #1
3664 if (!isa<ConstantSDNode>(RHS) || !isLegalArithImmed(RHS->getAsZExtVal())) {
3665 SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
3666
3667 if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
3668 std::swap(LHS, RHS);
3669 CC = ISD::getSetCCSwappedOperands(CC);
3670 }
3671 }
3672
3673 SDValue Cmp;
3674 AArch64CC::CondCode AArch64CC;
3675 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
3676 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
3677
3678 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
3679 // For the i8 operand, the largest immediate is 255, so this can be easily
3680 // encoded in the compare instruction. For the i16 operand, however, the
3681 // largest immediate cannot be encoded in the compare.
3682 // Therefore, use a sign extending load and cmn to avoid materializing the
3683 // -1 constant. For example,
3684 // movz w1, #65535
3685 // ldrh w0, [x0, #0]
3686 // cmp w0, w1
3687 // >
3688 // ldrsh w0, [x0, #0]
3689 // cmn w0, #1
3690 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
3691 // if and only if (sext LHS) == (sext RHS). The checks are in place to
3692 // ensure both the LHS and RHS are truly zero extended and to make sure the
3693 // transformation is profitable.
3694 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
3695 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
3696 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
3697 LHS.getNode()->hasNUsesOfValue(1, 0)) {
3698 int16_t ValueofRHS = RHS->getAsZExtVal();
3699 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
3700 SDValue SExt =
3701 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
3702 DAG.getValueType(MVT::i16));
3703 Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
3704 RHS.getValueType()),
3705 CC, dl, DAG);
3706 AArch64CC = changeIntCCToAArch64CC(CC);
3707 }
3708 }
3709
3710 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
3711 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
3712 if ((CC == ISD::SETNE) ^ RHSC->isZero())
3713 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
3714 }
3715 }
3716 }
3717
3718 if (!Cmp) {
3719 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
3720 AArch64CC = changeIntCCToAArch64CC(CC);
3721 }
3722 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
3723 return Cmp;
3724}
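// A minimal standalone sketch of the constant adjustment performed above,
// for the unsigned case: when C is not a legal arithmetic immediate but C-1
// is, (x < C) can be tested as (x <= C-1). Values and helper name are
// illustrative only.
#include <cassert>
#include <cstdint>

static void checkImmAdjustSketch() {
  const uint64_t C = 0x1001; // not encodable as imm12 or imm12, LSL #12
  const uint64_t Tests[] = {0, 0xFFF, 0x1000, 0x1001, 0x2000};
  for (uint64_t X : Tests) {
    bool Lt = X < C;        // original SETULT against C
    bool Le = X <= (C - 1); // rewritten SETULE against C-1 == 0x1000 (legal)
    assert(Lt == Le);
  }
}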
3725
3726static std::pair<SDValue, SDValue>
3727getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
3728 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
3729 "Unsupported value type");
3730 SDValue Value, Overflow;
3731 SDLoc DL(Op);
3732 SDValue LHS = Op.getOperand(0);
3733 SDValue RHS = Op.getOperand(1);
3734 unsigned Opc = 0;
3735 switch (Op.getOpcode()) {
3736 default:
3737 llvm_unreachable("Unknown overflow instruction!");
3738 case ISD::SADDO:
3739 Opc = AArch64ISD::ADDS;
3740 CC = AArch64CC::VS;
3741 break;
3742 case ISD::UADDO:
3743 Opc = AArch64ISD::ADDS;
3744 CC = AArch64CC::HS;
3745 break;
3746 case ISD::SSUBO:
3747 Opc = AArch64ISD::SUBS;
3748 CC = AArch64CC::VS;
3749 break;
3750 case ISD::USUBO:
3751 Opc = AArch64ISD::SUBS;
3752 CC = AArch64CC::LO;
3753 break;
3754 // Multiply needs a little bit of extra work.
3755 case ISD::SMULO:
3756 case ISD::UMULO: {
3757 CC = AArch64CC::NE;
3758 bool IsSigned = Op.getOpcode() == ISD::SMULO;
3759 if (Op.getValueType() == MVT::i32) {
3760 // Extend to 64-bits, then perform a 64-bit multiply.
3761 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3762 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
3763 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
3764 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3765 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
3766
3767 // Check that the result fits into a 32-bit integer.
3768 SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
3769 if (IsSigned) {
3770 // cmp xreg, wreg, sxtw
3771 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
3772 Overflow =
3773 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
3774 } else {
3775 // tst xreg, #0xffffffff00000000
3776 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
3777 Overflow =
3778 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
3779 }
3780 break;
3781 }
3782 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
3783 // For the 64 bit multiply
3784 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3785 if (IsSigned) {
3786 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
3787 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
3788 DAG.getConstant(63, DL, MVT::i64));
3789 // It is important that LowerBits is last, otherwise the arithmetic
3790 // shift will not be folded into the compare (SUBS).
3791 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3792 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
3793 .getValue(1);
3794 } else {
3795 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
3796 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3797 Overflow =
3798 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
3799 DAG.getConstant(0, DL, MVT::i64),
3800 UpperBits).getValue(1);
3801 }
3802 break;
3803 }
3804 } // switch (...)
3805
3806 if (Opc) {
3807 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
3808
3809 // Emit the AArch64 operation with overflow check.
3810 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
3811 Overflow = Value.getValue(1);
3812 }
3813 return std::make_pair(Value, Overflow);
3814}
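// A minimal standalone sketch of the 32-bit overflow-checked multiply
// strategy above (smulo case only): widen to 64 bits, multiply once, and the
// i32 operation overflowed exactly when the product does not sign-extend
// from its low 32 bits. The helper names are illustrative.
#include <cassert>
#include <cstdint>

static bool smulo32Sketch(int32_t A, int32_t B, int64_t &Wide) {
  Wide = (int64_t)A * (int64_t)B;              // one 64-bit multiply
  return Wide < INT32_MIN || Wide > INT32_MAX; // i.e. Wide != sext(trunc(Wide))
}

static void checkSmulo32Sketch() {
  int64_t W;
  assert(!smulo32Sketch(46340, 46340, W)); // 2147395600 fits in i32
  assert(smulo32Sketch(46341, 46341, W));  // 2147488281 does not
  assert(smulo32Sketch(INT32_MIN, -1, W)); // classic signed overflow
}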
3815
3816SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
3817 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
3818 !Subtarget->isNeonAvailable()))
3819 return LowerToScalableOp(Op, DAG);
3820
3821 SDValue Sel = Op.getOperand(0);
3822 SDValue Other = Op.getOperand(1);
3823 SDLoc dl(Sel);
3824
3825 // If the operand is an overflow checking operation, invert the condition
3826 // code and kill the Not operation. I.e., transform:
3827 // (xor (overflow_op_bool, 1))
3828 // -->
3829 // (csel 1, 0, invert(cc), overflow_op_bool)
3830 // ... which later gets transformed to just a cset instruction with an
3831 // inverted condition code, rather than a cset + eor sequence.
3832 if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) {
3833 // Only lower legal XALUO ops.
3834 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
3835 return SDValue();
3836
3837 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3838 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3839 AArch64CC::CondCode CC;
3840 SDValue Value, Overflow;
3841 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
3842 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3843 return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
3844 CCVal, Overflow);
3845 }
3846 // If neither operand is a SELECT_CC, give up.
3847 if (Sel.getOpcode() != ISD::SELECT_CC)
3848 std::swap(Sel, Other);
3849 if (Sel.getOpcode() != ISD::SELECT_CC)
3850 return Op;
3851
3852 // The folding we want to perform is:
3853 // (xor x, (select_cc a, b, cc, 0, -1) )
3854 // -->
3855 // (csel x, (xor x, -1), cc ...)
3856 //
3857 // The latter will get matched to a CSINV instruction.
3858
3859 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
3860 SDValue LHS = Sel.getOperand(0);
3861 SDValue RHS = Sel.getOperand(1);
3862 SDValue TVal = Sel.getOperand(2);
3863 SDValue FVal = Sel.getOperand(3);
3864
3865 // FIXME: This could be generalized to non-integer comparisons.
3866 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
3867 return Op;
3868
3869 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
3870 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
3871
3872 // The values aren't constants, this isn't the pattern we're looking for.
3873 if (!CFVal || !CTVal)
3874 return Op;
3875
3876 // We can commute the SELECT_CC by inverting the condition. This
3877 // might be needed to make this fit into a CSINV pattern.
3878 if (CTVal->isAllOnes() && CFVal->isZero()) {
3879 std::swap(TVal, FVal);
3880 std::swap(CTVal, CFVal);
3881 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
3882 }
3883
3884 // If the constants line up, perform the transform!
3885 if (CTVal->isZero() && CFVal->isAllOnes()) {
3886 SDValue CCVal;
3887 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
3888
3889 FVal = Other;
3890 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
3891 DAG.getConstant(-1ULL, dl, Other.getValueType()));
3892
3893 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
3894 CCVal, Cmp);
3895 }
3896
3897 return Op;
3898}
3899
3900// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
3901// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
3902// sets 'C' bit to 0.
3903static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
3904 SDLoc DL(Value);
3905 EVT VT = Value.getValueType();
3906 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
3907 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
3908 SDValue Cmp =
3909 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
3910 return Cmp.getValue(1);
3911}
3912
3913// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
3914// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
3915static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG,
3916 bool Invert) {
3917 assert(Glue.getResNo() == 1);
3918 SDLoc DL(Glue);
3919 SDValue Zero = DAG.getConstant(0, DL, VT);
3920 SDValue One = DAG.getConstant(1, DL, VT);
3921 unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
3922 SDValue CC = DAG.getConstant(Cond, DL, MVT::i32);
3923 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
3924}
3925
3926// Value is 1 if 'V' bit of NZCV is 1, else 0
3927static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) {
3928 assert(Glue.getResNo() == 1);
3929 SDLoc DL(Glue);
3930 SDValue Zero = DAG.getConstant(0, DL, VT);
3931 SDValue One = DAG.getConstant(1, DL, VT);
3932 SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32);
3933 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
3934}
3935
3936// This lowering is inefficient, but it will get cleaned up by
3937// `foldOverflowCheck`
3938static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
3939 unsigned Opcode, bool IsSigned) {
3940 EVT VT0 = Op.getValue(0).getValueType();
3941 EVT VT1 = Op.getValue(1).getValueType();
3942
3943 if (VT0 != MVT::i32 && VT0 != MVT::i64)
3944 return SDValue();
3945
3946 bool InvertCarry = Opcode == AArch64ISD::SBCS;
3947 SDValue OpLHS = Op.getOperand(0);
3948 SDValue OpRHS = Op.getOperand(1);
3949 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
3950
3951 SDLoc DL(Op);
3952 SDVTList VTs = DAG.getVTList(VT0, VT1);
3953
3954 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
3955 OpRHS, OpCarryIn);
3956
3957 SDValue OutFlag =
3958 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
3959 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
3960
3961 return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag);
3962}
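// A minimal standalone model (hypothetical helper names) of the carry
// convention handled above: AArch64 SBCS computes a + ~b + C, i.e.
// a - b - (1 - C), so the C flag means "no borrow" and a generic borrow bit
// must be inverted on the way into SBCS and when reading the borrow back out.
#include <cassert>
#include <cstdint>

static uint32_t adcsSketch(uint32_t A, uint32_t B, bool C, bool &COut) {
  uint64_t Wide = (uint64_t)A + B + (C ? 1 : 0);
  COut = (Wide >> 32) != 0;
  return (uint32_t)Wide;
}

static uint32_t sbcsSketch(uint32_t A, uint32_t B, bool C, bool &COut) {
  return adcsSketch(A, ~B, C, COut); // SBCS is ADCS with the inverted operand
}

static void checkCarryConventionSketch() {
  bool COut;
  // A generic usubo_carry with borrow-in = 1 feeds C = !borrow = 0 into SBCS.
  uint32_t R = sbcsSketch(5, 3, /*C=*/false, COut);
  assert(R == 1 && COut);            // 5 - 3 - 1 == 1, C = 1 means no borrow
  R = sbcsSketch(0, 0, /*C=*/false, COut);
  assert(R == 0xFFFFFFFFu && !COut); // 0 - 0 - 1 borrows, so C = 0
}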
3963
3964static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
3965 // Let legalize expand this if it isn't a legal type yet.
3966 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
3967 return SDValue();
3968
3969 SDLoc dl(Op);
3970 AArch64CC::CondCode CC;
3971 // The actual operation that sets the overflow or carry flag.
3972 SDValue Value, Overflow;
3973 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
3974
3975 // We use 0 and 1 as false and true values.
3976 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3977 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3978
3979 // We use an inverted condition, because the conditional select is inverted
3980 // too. This will allow it to be selected to a single instruction:
3981 // CSINC Wd, WZR, WZR, invert(cond).
3982 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3983 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
3984 CCVal, Overflow);
3985
3986 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
3987 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
3988}
3989
3990// Prefetch operands are:
3991// 1: Address to prefetch
3992// 2: bool isWrite
3993// 3: int locality (0 = no locality ... 3 = extreme locality)
3994// 4: bool isDataCache
3995static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
3996 SDLoc DL(Op);
3997 unsigned IsWrite = Op.getConstantOperandVal(2);
3998 unsigned Locality = Op.getConstantOperandVal(3);
3999 unsigned IsData = Op.getConstantOperandVal(4);
4000
4001 bool IsStream = !Locality;
4002 // When the locality number is set
4003 if (Locality) {
4004 // The front-end should have filtered out the out-of-range values
4005 assert(Locality <= 3 && "Prefetch locality out-of-range");
4006 // The locality degree is the opposite of the cache speed.
4007 // Put the number the other way around.
4008 // The encoding starts at 0 for level 1
4009 Locality = 3 - Locality;
4010 }
4011
4012 // Build the mask value encoding the expected behavior.
4013 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4014 (!IsData << 3) | // IsDataCache bit
4015 (Locality << 1) | // Cache level bits
4016 (unsigned)IsStream; // Stream bit
4017 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
4018 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
4019 Op.getOperand(1));
4020}
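// A minimal standalone sketch of the PRFM operand assembled above: bit 4
// selects store vs load, bit 3 selects the instruction cache, bits 2:1 hold
// the target cache level minus one, and bit 0 selects the streaming
// (non-temporal) policy. The helper name is illustrative; the sample
// encodings follow the locality-to-level mapping used in the code above.
#include <cassert>

static unsigned prfOpSketch(bool IsWrite, unsigned Locality, bool IsData) {
  bool IsStream = Locality == 0;
  unsigned Level = IsStream ? 0 : 3 - Locality; // locality 3 -> L1, 1 -> L3
  return (IsWrite << 4) | (!IsData << 3) | (Level << 1) | (unsigned)IsStream;
}

static void checkPrfOpSketch() {
  assert(prfOpSketch(false, 3, true) == 0b00000); // PLDL1KEEP
  assert(prfOpSketch(false, 0, true) == 0b00001); // PLDL1STRM
  assert(prfOpSketch(true, 1, true) == 0b10100);  // PSTL3KEEP
}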
4021
4022SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4023 SelectionDAG &DAG) const {
4024 EVT VT = Op.getValueType();
4025 if (VT.isScalableVector())
4026 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4027
4028 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4029 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4030
4031 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4032 return SDValue();
4033}
4034
4035SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4036 SelectionDAG &DAG) const {
4037 EVT VT = Op.getValueType();
4038 if (VT.isScalableVector())
4039 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4040
4041 bool IsStrict = Op->isStrictFPOpcode();
4042 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4043 EVT SrcVT = SrcVal.getValueType();
4044 bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
4045
4046 if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
4047 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4048
4049 // Expand cases where the result type is BF16 but we don't have hardware
4050 // instructions to lower it.
4051 if (VT.getScalarType() == MVT::bf16 &&
4052 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4053 Subtarget->hasBF16())) {
4054 SDLoc dl(Op);
4055 SDValue Narrow = SrcVal;
4056 SDValue NaN;
4057 EVT I32 = SrcVT.changeElementType(MVT::i32);
4058 EVT F32 = SrcVT.changeElementType(MVT::f32);
4059 if (SrcVT.getScalarType() == MVT::f32) {
4060 bool NeverSNaN = DAG.isKnownNeverSNaN(Narrow);
4061 Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4062 if (!NeverSNaN) {
4063 // Set the quiet bit.
4064 NaN = DAG.getNode(ISD::OR, dl, I32, Narrow,
4065 DAG.getConstant(0x400000, dl, I32));
4066 }
4067 } else if (SrcVT.getScalarType() == MVT::f64) {
4068 Narrow = DAG.getNode(AArch64ISD::FCVTXN, dl, F32, Narrow);
4069 Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4070 } else {
4071 return SDValue();
4072 }
4073 if (!Trunc) {
4074 SDValue One = DAG.getConstant(1, dl, I32);
4075 SDValue Lsb = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4076 DAG.getShiftAmountConstant(16, I32, dl));
4077 Lsb = DAG.getNode(ISD::AND, dl, I32, Lsb, One);
4078 SDValue RoundingBias =
4079 DAG.getNode(ISD::ADD, dl, I32, DAG.getConstant(0x7fff, dl, I32), Lsb);
4080 Narrow = DAG.getNode(ISD::ADD, dl, I32, Narrow, RoundingBias);
4081 }
4082
4083 // Don't round if we had a NaN; we don't want to turn 0x7fffffff into
4084 // 0x80000000.
4085 if (NaN) {
4086 SDValue IsNaN = DAG.getSetCC(
4087 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT),
4088 SrcVal, SrcVal, ISD::SETUO);
4089 Narrow = DAG.getSelect(dl, I32, IsNaN, NaN, Narrow);
4090 }
4091
4092 // Now that we have rounded, shift the bits into position.
4093 Narrow = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4094 DAG.getShiftAmountConstant(16, I32, dl));
4095 if (VT.isVector()) {
4096 EVT I16 = I32.changeVectorElementType(MVT::i16);
4097 Narrow = DAG.getNode(ISD::TRUNCATE, dl, I16, Narrow);
4098 return DAG.getNode(ISD::BITCAST, dl, VT, Narrow);
4099 }
4100 Narrow = DAG.getNode(ISD::BITCAST, dl, F32, Narrow);
4101 SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, dl, VT, Narrow);
4102 return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, dl)
4103 : Result;
4104 }
4105
4106 if (SrcVT != MVT::f128) {
4107 // Expand cases where the input is a vector bigger than NEON.
4108 if (useSVEForFixedLengthVectorVT(SrcVT))
4109 return SDValue();
4110
4111 // It's legal except when f128 is involved
4112 return Op;
4113 }
4114
4115 return SDValue();
4116}
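// A scalar reference sketch (hypothetical helper name) of the f32 -> bf16
// narrowing emitted above when no BF16 hardware is available: round to
// nearest-even by adding 0x7fff plus the current LSB of the truncated
// result, keep NaNs quiet instead of letting the bias disturb the payload,
// then take the high 16 bits.
#include <cassert>
#include <cstdint>
#include <cstring>

static uint16_t f32ToBF16Sketch(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  if ((Bits & 0x7FFFFFFF) > 0x7F800000)          // NaN: just set the quiet bit
    return (uint16_t)((Bits | 0x400000) >> 16);
  uint32_t Lsb = (Bits >> 16) & 1;               // ties-to-even bias
  return (uint16_t)((Bits + 0x7FFF + Lsb) >> 16);
}

static void checkBF16Sketch() {
  assert(f32ToBF16Sketch(1.0f) == 0x3F80);       // 1.0 is exact in bf16
  assert(f32ToBF16Sketch(1.0078125f) == 0x3F81); // 1 + 2^-7 is exact too
}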
4117
4118SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4119 SelectionDAG &DAG) const {
4120 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4121 // Any additional optimization in this function should be recorded
4122 // in the cost tables.
4123 bool IsStrict = Op->isStrictFPOpcode();
4124 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
4125 EVT VT = Op.getValueType();
4126
4127 if (VT.isScalableVector()) {
4128 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4129 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4130 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4131 return LowerToPredicatedOp(Op, DAG, Opcode);
4132 }
4133
4134 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4135 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4136 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4137
4138 unsigned NumElts = InVT.getVectorNumElements();
4139
4140 // f16 conversions are promoted to f32 when full fp16 is not supported.
4141 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4142 InVT.getVectorElementType() == MVT::bf16) {
4143 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
4144 SDLoc dl(Op);
4145 if (IsStrict) {
4146 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
4147 {Op.getOperand(0), Op.getOperand(1)});
4148 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4149 {Ext.getValue(1), Ext.getValue(0)});
4150 }
4151 return DAG.getNode(
4152 Op.getOpcode(), dl, Op.getValueType(),
4153 DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
4154 }
4155
4156 uint64_t VTSize = VT.getFixedSizeInBits();
4157 uint64_t InVTSize = InVT.getFixedSizeInBits();
4158 if (VTSize < InVTSize) {
4159 SDLoc dl(Op);
4160 if (IsStrict) {
4161 InVT = InVT.changeVectorElementTypeToInteger();
4162 SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other},
4163 {Op.getOperand(0), Op.getOperand(1)});
4164 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4165 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl);
4166 }
4167 SDValue Cv =
4168 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
4169 Op.getOperand(0));
4170 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4171 }
4172
4173 if (VTSize > InVTSize) {
4174 SDLoc dl(Op);
4175 MVT ExtVT =
4176 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
4177 VT.getVectorNumElements());
4178 if (IsStrict) {
4179 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other},
4180 {Op.getOperand(0), Op.getOperand(1)});
4181 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4182 {Ext.getValue(1), Ext.getValue(0)});
4183 }
4184 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
4185 return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
4186 }
4187
4188 // Use a scalar operation for conversions between single-element vectors of
4189 // the same size.
4190 if (NumElts == 1) {
4191 SDLoc dl(Op);
4192 SDValue Extract = DAG.getNode(
4193 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4194 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64));
4195 EVT ScalarVT = VT.getScalarType();
4196 if (IsStrict)
4197 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4198 {Op.getOperand(0), Extract});
4199 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4200 }
4201
4202 // Type changing conversions are illegal.
4203 return Op;
4204}
4205
4206SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4207 SelectionDAG &DAG) const {
4208 bool IsStrict = Op->isStrictFPOpcode();
4209 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4210
4211 if (SrcVal.getValueType().isVector())
4212 return LowerVectorFP_TO_INT(Op, DAG);
4213
4214 // f16 conversions are promoted to f32 when full fp16 is not supported.
4215 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4216 SrcVal.getValueType() == MVT::bf16) {
4217 SDLoc dl(Op);
4218 if (IsStrict) {
4219 SDValue Ext =
4220 DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
4221 {Op.getOperand(0), SrcVal});
4222 return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other},
4223 {Ext.getValue(1), Ext.getValue(0)});
4224 }
4225 return DAG.getNode(
4226 Op.getOpcode(), dl, Op.getValueType(),
4227 DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
4228 }
4229
4230 if (SrcVal.getValueType() != MVT::f128) {
4231 // It's legal except when f128 is involved
4232 return Op;
4233 }
4234
4235 return SDValue();
4236}
4237
4238SDValue
4239AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4240 SelectionDAG &DAG) const {
4241 // AArch64 FP-to-int conversions saturate to the destination element size, so
4242 // we can lower common saturating conversions to simple instructions.
4243 SDValue SrcVal = Op.getOperand(0);
4244 EVT SrcVT = SrcVal.getValueType();
4245 EVT DstVT = Op.getValueType();
4246 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4247
4248 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4249 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4250 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4251 assert(SatWidth <= DstElementWidth &&
4252 "Saturation width cannot exceed result width");
4253
4254 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4255 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4256 // types, so this is hard to reach.
4257 if (DstVT.isScalableVector())
4258 return SDValue();
4259
4260 EVT SrcElementVT = SrcVT.getVectorElementType();
4261
4262 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4263 if ((SrcElementVT == MVT::f16 &&
4264 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
4265 SrcElementVT == MVT::bf16) {
4266 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
4267 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), F32VT, SrcVal);
4268 SrcVT = F32VT;
4269 SrcElementVT = MVT::f32;
4270 SrcElementWidth = 32;
4271 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4272 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
4273 return SDValue();
4274
4275 SDLoc DL(Op);
4276 // Cases that we can emit directly.
4277 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth)
4278 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4279 DAG.getValueType(DstVT.getScalarType()));
4280
4281 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4282 // result. This is only valid if the legal cvt is larger than the saturate
4283 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
4284 // (at least until sqxtn is selected).
4285 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4286 return SDValue();
4287
4288 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4289 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
4290 DAG.getValueType(IntVT.getScalarType()));
4291 SDValue Sat;
4292 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4293 SDValue MinC = DAG.getConstant(
4294 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4295 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
4296 SDValue MaxC = DAG.getConstant(
4297 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4298 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
4299 } else {
4300 SDValue MinC = DAG.getConstant(
4301 APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
4302 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
4303 }
4304
4305 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4306}
4307
4308SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
4309 SelectionDAG &DAG) const {
4310 // AArch64 FP-to-int conversions saturate to the destination register size, so
4311 // we can lower common saturating conversions to simple instructions.
4312 SDValue SrcVal = Op.getOperand(0);
4313 EVT SrcVT = SrcVal.getValueType();
4314
4315 if (SrcVT.isVector())
4316 return LowerVectorFP_TO_INT_SAT(Op, DAG);
4317
4318 EVT DstVT = Op.getValueType();
4319 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4320 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4321 uint64_t DstWidth = DstVT.getScalarSizeInBits();
4322 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
4323
4324 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4325 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
4326 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
4327 SrcVT = MVT::f32;
4328 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
4329 SrcVT != MVT::bf16)
4330 return SDValue();
4331
4332 SDLoc DL(Op);
4333 // Cases that we can emit directly.
4334 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
4335 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
4336 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
4337 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4338 DAG.getValueType(DstVT));
4339
4340 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4341 // result. This is only valid if the legal cvt is larger than the saturate
4342 // width.
4343 if (DstWidth < SatWidth)
4344 return SDValue();
4345
4346 SDValue NativeCvt =
4347 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
4348 SDValue Sat;
4349 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4350 SDValue MinC = DAG.getConstant(
4351 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
4352 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
4353 SDValue MaxC = DAG.getConstant(
4354 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
4355 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
4356 } else {
4357 SDValue MinC = DAG.getConstant(
4358 APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
4359 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
4360 }
4361
4362 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4363}
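// A minimal standalone sketch (hypothetical helper, f32 -> i8 example) of
// the clamp emitted above when the native convert is wider than the
// requested saturation width: convert at the wider width, then clamp to the
// narrow type's range before truncating; NaN maps to 0 as the saturating
// conversions require.
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

static int8_t fpToSInt8SatSketch(float F) {
  if (F != F)                 // NaN saturates to 0
    return 0;
  // Wider native convert (F assumed within i32 range for this sketch).
  int32_t Wide = (int32_t)F;
  // Clamp to the i8 bounds, mirroring the SMIN/SMAX pair above.
  Wide = std::max(Wide, (int32_t)INT8_MIN);
  Wide = std::min(Wide, (int32_t)INT8_MAX);
  return (int8_t)Wide;
}

static void checkFpToIntSatSketch() {
  assert(fpToSInt8SatSketch(300.0f) == 127);
  assert(fpToSInt8SatSketch(-300.0f) == -128);
  assert(fpToSInt8SatSketch(42.9f) == 42);
  assert(fpToSInt8SatSketch(std::nanf("")) == 0);
}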
4364
4365SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
4366 SelectionDAG &DAG) const {
4367 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4368 // Any additional optimization in this function should be recorded
4369 // in the cost tables.
4370 bool IsStrict = Op->isStrictFPOpcode();
4371 EVT VT = Op.getValueType();
4372 SDLoc dl(Op);
4373 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
4374 EVT InVT = In.getValueType();
4375 unsigned Opc = Op.getOpcode();
4376 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
4377
4378 if (VT.isScalableVector()) {
4379 if (InVT.getVectorElementType() == MVT::i1) {
4380 // We can't directly extend an SVE predicate; extend it first.
4381 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4382 EVT CastVT = getPromotedVTForPredicate(InVT);
4383 In = DAG.getNode(CastOpc, dl, CastVT, In);
4384 return DAG.getNode(Opc, dl, VT, In);
4385 }
4386
4387 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
4388 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
4389 return LowerToPredicatedOp(Op, DAG, Opcode);
4390 }
4391
4392 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4393 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4394 return LowerFixedLengthIntToFPToSVE(Op, DAG);
4395
4396 // Promote bf16 conversions to f32.
4397 if (VT.getVectorElementType() == MVT::bf16) {
4398 EVT F32 = VT.changeElementType(MVT::f32);
4399 if (IsStrict) {
4400 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {F32, MVT::Other},
4401 {Op.getOperand(0), In});
4402 return DAG.getNode(
4403 ISD::STRICT_FP_ROUND, dl, {Op.getValueType(), MVT::Other},
4404 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4405 }
4406 return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
4407 DAG.getNode(Op.getOpcode(), dl, F32, In),
4408 DAG.getIntPtrConstant(0, dl));
4409 }
4410
4411 uint64_t VTSize = VT.getFixedSizeInBits();
4412 uint64_t InVTSize = InVT.getFixedSizeInBits();
4413 if (VTSize < InVTSize) {
4414 MVT CastVT =
4415 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
4416 InVT.getVectorNumElements());
4417 if (IsStrict) {
4418 In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
4419 {Op.getOperand(0), In});
4420 return DAG.getNode(
4421 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
4422 {In.getValue(1), In.getValue(0), DAG.getIntPtrConstant(0, dl)});
4423 }
4424 In = DAG.getNode(Opc, dl, CastVT, In);
4425 return DAG.getNode(ISD::FP_ROUND, dl, VT, In,
4426 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
4427 }
4428
4429 if (VTSize > InVTSize) {
4430 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4431 EVT CastVT = VT.changeVectorElementTypeToInteger();
4432 In = DAG.getNode(CastOpc, dl, CastVT, In);
4433 if (IsStrict)
4434 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In});
4435 return DAG.getNode(Opc, dl, VT, In);
4436 }
4437
4438 // Use a scalar operation for conversions between single-element vectors of
4439 // the same size.
4440 if (VT.getVectorNumElements() == 1) {
4441 SDValue Extract = DAG.getNode(
4442 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4443 In, DAG.getConstant(0, dl, MVT::i64));
4444 EVT ScalarVT = VT.getScalarType();
4445 if (IsStrict)
4446 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4447 {Op.getOperand(0), Extract});
4448 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4449 }
4450
4451 return Op;
4452}
4453
4454SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
4455 SelectionDAG &DAG) const {
4456 if (Op.getValueType().isVector())
4457 return LowerVectorINT_TO_FP(Op, DAG);
4458
4459 bool IsStrict = Op->isStrictFPOpcode();
4460 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4461
4462 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
4463 Op->getOpcode() == ISD::SINT_TO_FP;
4464
4465 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
4466 SDLoc dl(Op);
4467 if (IsStrict) {
4468 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {PromoteVT, MVT::Other},
4469 {Op.getOperand(0), SrcVal});
4470 return DAG.getNode(
4471 ISD::STRICT_FP_ROUND, dl, {Op.getValueType(), MVT::Other},
4472 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4473 }
4474 return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
4475 DAG.getNode(Op.getOpcode(), dl, PromoteVT, SrcVal),
4476 DAG.getIntPtrConstant(0, dl));
4477 };
4478
4479 if (Op.getValueType() == MVT::bf16) {
4480 unsigned MaxWidth = IsSigned
4481 ? DAG.ComputeMaxSignificantBits(SrcVal)
4482 : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
4483 // bf16 conversions are promoted to f32 when converting from i16.
4484 if (MaxWidth <= 24) {
4485 return IntToFpViaPromotion(MVT::f32);
4486 }
4487
4488 // bf16 conversions are promoted to f64 when converting from i32.
4489 if (MaxWidth <= 53) {
4490 return IntToFpViaPromotion(MVT::f64);
4491 }
4492
4493 // We need to be careful about i64 -> bf16.
4494 // Consider an i32 22216703.
4495 // This number cannot be represented exactly as an f32, so an itofp will
4496 // turn it into 22216704.0; an fptrunc to bf16 will then turn this into 22282240.0.
4497 // However, the correct bf16 result was supposed to be 22151168.0.
4498 // We need to use sticky rounding to get this correct.
4499 if (SrcVal.getValueType() == MVT::i64) {
4500 SDLoc DL(Op);
4501 // This algorithm is equivalent to the following:
4502 // uint64_t SrcHi = SrcVal & ~0xfffull;
4503 // uint64_t SrcLo = SrcVal & 0xfffull;
4504 // uint64_t Highest = SrcVal >> 53;
4505 // bool HasHighest = Highest != 0;
4506 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
4507 // double Rounded = static_cast<double>(ToRound);
4508 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
4509 // uint64_t HasLo = SrcLo != 0;
4510 // bool NeedsAdjustment = HasHighest & HasLo;
4511 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
4512 // double Adjusted = std::bit_cast<double>(AdjustedBits);
4513 // return static_cast<__bf16>(Adjusted);
4514 //
4515 // Essentially, what happens is that SrcVal either fits perfectly in a
4516 // double-precision value or it is too big. If it is sufficiently small,
4517 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
4518 // ensure that u64 -> double has no rounding error by only using the 52
4519 // MSB of the input. The low order bits will get merged into a sticky bit
4520 // which will avoid issues incurred by double rounding.
4521
4522 // Signed conversion is more or less like so:
4523 // copysign((__bf16)abs(SrcVal), SrcVal)
4524 SDValue SignBit;
4525 if (IsSigned) {
4526 SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4527 DAG.getConstant(1ull << 63, DL, MVT::i64));
4528 SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
4529 }
4530 SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4531 DAG.getConstant(~0xfffull, DL, MVT::i64));
4532 SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4533 DAG.getConstant(0xfffull, DL, MVT::i64));
4534       SDValue Highest =
4535           DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
4536 DAG.getShiftAmountConstant(53, MVT::i64, DL));
4537 SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
4538 SDValue ToRound =
4539 DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
4540 SDValue Rounded =
4541 IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
4542 {Op.getOperand(0), ToRound})
4543 : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
4544
4545 SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
4546 if (SignBit) {
4547 RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
4548 }
4549
4550 SDValue HasHighest = DAG.getSetCC(
4551 DL,
4552 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
4553 Highest, Zero64, ISD::SETNE);
4554
4555 SDValue HasLo = DAG.getSetCC(
4556 DL,
4557 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
4558 SrcLo, Zero64, ISD::SETNE);
4559
4560 SDValue NeedsAdjustment =
4561 DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
4562 NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
4563
4564 SDValue AdjustedBits =
4565 DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
4566 SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
4567 return IsStrict
4568                  ? DAG.getNode(ISD::STRICT_FP_ROUND, DL,
4569                                {Op.getValueType(), MVT::Other},
4570 {Rounded.getValue(1), Adjusted,
4571 DAG.getIntPtrConstant(0, DL)})
4572 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
4573 DAG.getIntPtrConstant(0, DL, true));
4574 }
4575 }
4576
4577 // f16 conversions are promoted to f32 when full fp16 is not supported.
4578 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
4579 return IntToFpViaPromotion(MVT::f32);
4580 }
4581
4582 // i128 conversions are libcalls.
4583 if (SrcVal.getValueType() == MVT::i128)
4584 return SDValue();
4585
4586 // Other conversions are legal, unless it's to the completely software-based
4587 // fp128.
4588 if (Op.getValueType() != MVT::f128)
4589 return Op;
4590 return SDValue();
4591}
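// Illustrative sketch, not part of this file: the pseudocode in the comment
// above translated into standalone host code for the unsigned i64 -> bf16
// path. It assumes a toolchain that provides __bf16 and C++20 std::bit_cast;
// the helper name is made up for illustration.
#include <bit>
#include <cstdint>

static __bf16 U64ToBF16Sticky(uint64_t SrcVal) {
  uint64_t SrcHi = SrcVal & ~0xfffull;    // keep only the high 52 bits exactly
  uint64_t SrcLo = SrcVal & 0xfffull;     // dropped low bits feed the sticky bit
  bool HasHighest = (SrcVal >> 53) != 0;  // value needs more than 53 bits?
  uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
  double Rounded = static_cast<double>(ToRound); // exact in both branches
  uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
  bool NeedsAdjustment = HasHighest && SrcLo != 0;
  uint64_t AdjustedBits = RoundedBits | static_cast<uint64_t>(NeedsAdjustment);
  double Adjusted = std::bit_cast<double>(AdjustedBits); // LSB acts as a sticky bit
  return static_cast<__bf16>(Adjusted);   // single rounding decision down to bf16
}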
4592
4593SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
4594 SelectionDAG &DAG) const {
4595 // For iOS, we want to call an alternative entry point: __sincos_stret,
4596 // which returns the values in two S / D registers.
4597 SDLoc dl(Op);
4598 SDValue Arg = Op.getOperand(0);
4599 EVT ArgVT = Arg.getValueType();
4600 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
4601
4602   ArgListTy Args;
4603   ArgListEntry Entry;
4604
4605 Entry.Node = Arg;
4606 Entry.Ty = ArgTy;
4607 Entry.IsSExt = false;
4608 Entry.IsZExt = false;
4609 Args.push_back(Entry);
4610
4611 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
4612 : RTLIB::SINCOS_STRET_F32;
4613 const char *LibcallName = getLibcallName(LC);
4614 SDValue Callee =
4615 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
4616
4617 StructType *RetTy = StructType::get(ArgTy, ArgTy);
4618   TargetLowering::CallLoweringInfo CLI(DAG);
4619   CLI.setDebugLoc(dl)
4620 .setChain(DAG.getEntryNode())
4621 .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
4622
4623 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
4624 return CallResult.first;
4625}
4626
4627static MVT getSVEContainerType(EVT ContentTy);
4628
4629SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
4630 SelectionDAG &DAG) const {
4631 EVT OpVT = Op.getValueType();
4632 EVT ArgVT = Op.getOperand(0).getValueType();
4633
4634   if (useSVEForFixedLengthVectorVT(OpVT))
4635     return LowerFixedLengthBitcastToSVE(Op, DAG);
4636
4637 if (OpVT.isScalableVector()) {
4638 // Bitcasting between unpacked vector types of different element counts is
4639 // not a NOP because the live elements are laid out differently.
4640 // 01234567
4641 // e.g. nxv2i32 = XX??XX??
4642 // nxv4f16 = X?X?X?X?
4643 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
4644 return SDValue();
4645
4646 if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) {
4647 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
4648 "Expected int->fp bitcast!");
4649 SDValue ExtResult =
4650           DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
4651                       Op.getOperand(0));
4652 return getSVESafeBitCast(OpVT, ExtResult, DAG);
4653 }
4654 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
4655 }
4656
4657 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
4658 return SDValue();
4659
4660 // Bitcasts between f16 and bf16 are legal.
4661 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
4662 return Op;
4663
4664 assert(ArgVT == MVT::i16);
4665 SDLoc DL(Op);
4666
4667 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
4668 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
4669 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
4670}
4671
4672static EVT getExtensionTo64Bits(const EVT &OrigVT) {
4673 if (OrigVT.getSizeInBits() >= 64)
4674 return OrigVT;
4675
4676 assert(OrigVT.isSimple() && "Expecting a simple value type");
4677
4678 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
4679 switch (OrigSimpleTy) {
4680 default: llvm_unreachable("Unexpected Vector Type");
4681 case MVT::v2i8:
4682 case MVT::v2i16:
4683 return MVT::v2i32;
4684 case MVT::v4i8:
4685 return MVT::v4i16;
4686 }
4687}
4688
4689 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
4690                                                  const EVT &OrigTy,
4691 const EVT &ExtTy,
4692 unsigned ExtOpcode) {
4693 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
4694 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
4695 // 64-bits we need to insert a new extension so that it will be 64-bits.
4696 assert(ExtTy.is128BitVector() && "Unexpected extension size");
4697 if (OrigTy.getSizeInBits() >= 64)
4698 return N;
4699
4700 // Must extend size to at least 64 bits to be used as an operand for VMULL.
4701 EVT NewVT = getExtensionTo64Bits(OrigTy);
4702
4703 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
4704}
4705
4706// Returns lane if Op extracts from a two-element vector and lane is constant
4707// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
4708static std::optional<uint64_t>
4709 getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
4710   SDNode *OpNode = Op.getNode();
4711 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
4712 return std::nullopt;
4713
4714 EVT VT = OpNode->getOperand(0).getValueType();
4715 ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
4716 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
4717 return std::nullopt;
4718
4719 return C->getZExtValue();
4720}
4721
4722 static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
4723                                    bool isSigned) {
4724 EVT VT = N.getValueType();
4725
4726 if (N.getOpcode() != ISD::BUILD_VECTOR)
4727 return false;
4728
4729 for (const SDValue &Elt : N->op_values()) {
4730 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
4731 unsigned EltSize = VT.getScalarSizeInBits();
4732 unsigned HalfSize = EltSize / 2;
4733 if (isSigned) {
4734 if (!isIntN(HalfSize, C->getSExtValue()))
4735 return false;
4736 } else {
4737 if (!isUIntN(HalfSize, C->getZExtValue()))
4738 return false;
4739 }
4740 continue;
4741 }
4742 return false;
4743 }
4744
4745 return true;
4746}
4747
4748 static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
4749   EVT VT = N.getValueType();
4750 assert(VT.is128BitVector() && "Unexpected vector MULL size");
4751
4752 unsigned NumElts = VT.getVectorNumElements();
4753 unsigned OrigEltSize = VT.getScalarSizeInBits();
4754 unsigned EltSize = OrigEltSize / 2;
4755 MVT TruncVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
4756
4757 APInt HiBits = APInt::getHighBitsSet(OrigEltSize, EltSize);
4758 if (DAG.MaskedValueIsZero(N, HiBits))
4759 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), TruncVT, N);
4760
4761 if (ISD::isExtOpcode(N.getOpcode()))
4762 return addRequiredExtensionForVectorMULL(N.getOperand(0), DAG,
4763 N.getOperand(0).getValueType(), VT,
4764 N.getOpcode());
4765
4766 assert(N.getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
4767 SDLoc dl(N);
4768   SmallVector<SDValue, 8> Ops;
4769   for (unsigned i = 0; i != NumElts; ++i) {
4770 const APInt &CInt = N.getConstantOperandAPInt(i);
4771 // Element types smaller than 32 bits are not legal, so use i32 elements.
4772 // The values are implicitly truncated so sext vs. zext doesn't matter.
4773 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
4774 }
4775 return DAG.getBuildVector(TruncVT, dl, Ops);
4776}
4777
4778 static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
4779   return N.getOpcode() == ISD::SIGN_EXTEND ||
4780 N.getOpcode() == ISD::ANY_EXTEND ||
4781 isExtendedBUILD_VECTOR(N, DAG, true);
4782}
4783
4784 static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {
4785   return N.getOpcode() == ISD::ZERO_EXTEND ||
4786 N.getOpcode() == ISD::ANY_EXTEND ||
4787 isExtendedBUILD_VECTOR(N, DAG, false);
4788}
4789
4790 static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
4791   unsigned Opcode = N.getOpcode();
4792 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4793 SDValue N0 = N.getOperand(0);
4794 SDValue N1 = N.getOperand(1);
4795 return N0->hasOneUse() && N1->hasOneUse() &&
4796 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
4797 }
4798 return false;
4799}
4800
4801 static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
4802   unsigned Opcode = N.getOpcode();
4803 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4804 SDValue N0 = N.getOperand(0);
4805 SDValue N1 = N.getOperand(1);
4806 return N0->hasOneUse() && N1->hasOneUse() &&
4807 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
4808 }
4809 return false;
4810}
4811
4812SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
4813 SelectionDAG &DAG) const {
4814 // The rounding mode is in bits 23:22 of the FPCR.
4815 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0,
4816 // and the formula we use to implement this is (((FPCR + (1 << 22)) >> 22) & 3)
4817 // so that the shift + and get folded into a bitfield extract.
4818 SDLoc dl(Op);
4819
4820 SDValue Chain = Op.getOperand(0);
4821 SDValue FPCR_64 = DAG.getNode(
4822 ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
4823 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
4824 Chain = FPCR_64.getValue(1);
4825 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
4826 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
4827 DAG.getConstant(1U << 22, dl, MVT::i32));
4828 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
4829 DAG.getConstant(22, dl, MVT::i32));
4830 SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
4831 DAG.getConstant(3, dl, MVT::i32));
4832 return DAG.getMergeValues({AND, Chain}, dl);
4833}
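// Illustrative sketch, not part of this file: the same mapping in scalar form.
// FPCR.RMode (bits 23:22) encodes RN=0, RP=1, RM=2, RZ=3, while FLT_ROUNDS uses
// RZ=0, RN=1, RP=2, RM=3, so adding 1 modulo 4 converts one into the other.
// The helper name is made up for illustration.
#include <cstdint>

static int FltRoundsFromFPCR(uint64_t FPCR) {
  return static_cast<int>(((static_cast<uint32_t>(FPCR) + (1u << 22)) >> 22) & 3);
}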
4834
4835SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
4836 SelectionDAG &DAG) const {
4837 SDLoc DL(Op);
4838 SDValue Chain = Op->getOperand(0);
4839 SDValue RMValue = Op->getOperand(1);
4840
4841 // The rounding mode is in bits 23:22 of the FPCR.
4842 // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
4843 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
4844 // ((arg - 1) & 3) << 22).
4845 //
4846 // The argument of llvm.set.rounding must be within the segment [0, 3], so
4847 // NearestTiesToAway (4) is not handled here. It is the responsibility of the
4848 // code that generates llvm.set.rounding to ensure this condition.
4849
4850 // Calculate new value of FPCR[23:22].
4851 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
4852 DAG.getConstant(1, DL, MVT::i32));
4853 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
4854 DAG.getConstant(0x3, DL, MVT::i32));
4855 RMValue =
4856 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
4857 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
4858 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
4859
4860 // Get current value of FPCR.
4861 SDValue Ops[] = {
4862 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
4863 SDValue FPCR =
4864 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
4865 Chain = FPCR.getValue(1);
4866 FPCR = FPCR.getValue(0);
4867
4868   // Put the new rounding mode into FPCR[23:22].
4869 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
4870 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
4871 DAG.getConstant(RMMask, DL, MVT::i64));
4872 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
4873 SDValue Ops2[] = {
4874 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
4875 FPCR};
4876 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
4877}
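// Illustrative sketch, not part of this file: the inverse mapping used above,
// ((arg - 1) & 3) << 22, which rebuilds the FPCR.RMode field (bits 23:22) from
// the llvm.set.rounding argument (0->3, 1->0, 2->1, 3->2). Hypothetical helper.
#include <cstdint>

static uint64_t FPCRRModeBitsFromFltRounds(uint32_t Arg) {
  return static_cast<uint64_t>((Arg - 1u) & 3u) << 22;
}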
4878
4879SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
4880 SelectionDAG &DAG) const {
4881 SDLoc DL(Op);
4882 SDValue Chain = Op->getOperand(0);
4883
4884 // Get current value of FPCR.
4885 SDValue Ops[] = {
4886 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
4887 SDValue FPCR =
4888 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
4889 Chain = FPCR.getValue(1);
4890 FPCR = FPCR.getValue(0);
4891
4892 // Truncate FPCR to 32 bits.
4893 SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR);
4894
4895 return DAG.getMergeValues({Result, Chain}, DL);
4896}
4897
4898SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
4899 SelectionDAG &DAG) const {
4900 SDLoc DL(Op);
4901 SDValue Chain = Op->getOperand(0);
4902 SDValue Mode = Op->getOperand(1);
4903
4904 // Extend the specified value to 64 bits.
4905 SDValue FPCR = DAG.getZExtOrTrunc(Mode, DL, MVT::i64);
4906
4907 // Set new value of FPCR.
4908 SDValue Ops2[] = {
4909 Chain, DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), FPCR};
4910 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
4911}
4912
4913SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
4914 SelectionDAG &DAG) const {
4915 SDLoc DL(Op);
4916 SDValue Chain = Op->getOperand(0);
4917
4918 // Get current value of FPCR.
4919 SDValue Ops[] = {
4920 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
4921 SDValue FPCR =
4922 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
4923 Chain = FPCR.getValue(1);
4924 FPCR = FPCR.getValue(0);
4925
4926 // Clear bits that are not reserved.
4927 SDValue FPSCRMasked = DAG.getNode(
4928 ISD::AND, DL, MVT::i64, FPCR,
4930
4931 // Set new value of FPCR.
4932 SDValue Ops2[] = {Chain,
4933 DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
4934 FPSCRMasked};
4935 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
4936}
4937
4938static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
4939 SDLoc DL, bool &IsMLA) {
4940 bool IsN0SExt = isSignExtended(N0, DAG);
4941 bool IsN1SExt = isSignExtended(N1, DAG);
4942 if (IsN0SExt && IsN1SExt)
4943 return AArch64ISD::SMULL;
4944
4945 bool IsN0ZExt = isZeroExtended(N0, DAG);
4946 bool IsN1ZExt = isZeroExtended(N1, DAG);
4947
4948 if (IsN0ZExt && IsN1ZExt)
4949 return AArch64ISD::UMULL;
4950
4951 // Select SMULL if we can replace zext with sext.
4952 if (((IsN0SExt && IsN1ZExt) || (IsN0ZExt && IsN1SExt)) &&
4953 !isExtendedBUILD_VECTOR(N0, DAG, false) &&
4954 !isExtendedBUILD_VECTOR(N1, DAG, false)) {
4955 SDValue ZextOperand;
4956 if (IsN0ZExt)
4957 ZextOperand = N0.getOperand(0);
4958 else
4959 ZextOperand = N1.getOperand(0);
4960 if (DAG.SignBitIsZero(ZextOperand)) {
4961 SDValue NewSext =
4962 DAG.getSExtOrTrunc(ZextOperand, DL, N0.getValueType());
4963 if (IsN0ZExt)
4964 N0 = NewSext;
4965 else
4966 N1 = NewSext;
4967 return AArch64ISD::SMULL;
4968 }
4969 }
4970
4971 // Select UMULL if we can replace the other operand with an extend.
4972 if (IsN0ZExt || IsN1ZExt) {
4973 EVT VT = N0.getValueType();
4974     APInt Mask = APInt::getHighBitsSet(VT.getScalarSizeInBits(),
4975                                        VT.getScalarSizeInBits() / 2);
4976 if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
4977 return AArch64ISD::UMULL;
4978 }
4979
4980 if (!IsN1SExt && !IsN1ZExt)
4981 return 0;
4982
4983 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
4984 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
4985 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
4986 IsMLA = true;
4987 return AArch64ISD::SMULL;
4988 }
4989 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
4990 IsMLA = true;
4991 return AArch64ISD::UMULL;
4992 }
4993 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
4994 std::swap(N0, N1);
4995 IsMLA = true;
4996 return AArch64ISD::UMULL;
4997 }
4998 return 0;
4999}
5000
5001SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
5002 EVT VT = Op.getValueType();
5003
5004 bool OverrideNEON = !Subtarget->isNeonAvailable();
5005 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
5006 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5007
5008 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
5009 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
5010 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
5011 "unexpected type for custom-lowering ISD::MUL");
5012 SDValue N0 = Op.getOperand(0);
5013 SDValue N1 = Op.getOperand(1);
5014 bool isMLA = false;
5015 EVT OVT = VT;
5016 if (VT.is64BitVector()) {
5017 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5018 isNullConstant(N0.getOperand(1)) &&
5019         N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5020         isNullConstant(N1.getOperand(1))) {
5021 N0 = N0.getOperand(0);
5022 N1 = N1.getOperand(0);
5023 VT = N0.getValueType();
5024 } else {
5025 if (VT == MVT::v1i64) {
5026 if (Subtarget->hasSVE())
5027 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5028 // Fall through to expand this. It is not legal.
5029 return SDValue();
5030 } else
5031 // Other vector multiplications are legal.
5032 return Op;
5033 }
5034 }
5035
5036 SDLoc DL(Op);
5037 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
5038
5039 if (!NewOpc) {
5040 if (VT.getVectorElementType() == MVT::i64) {
5041 // If SVE is available then i64 vector multiplications can also be made
5042 // legal.
5043 if (Subtarget->hasSVE())
5044 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5045 // Fall through to expand this. It is not legal.
5046 return SDValue();
5047 } else
5048 // Other vector multiplications are legal.
5049 return Op;
5050 }
5051
5052 // Legalize to a S/UMULL instruction
5053 SDValue Op0;
5054 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
5055 if (!isMLA) {
5056 Op0 = skipExtensionForVectorMULL(N0, DAG);
5057     assert(Op0.getValueType().is64BitVector() &&
5058            Op1.getValueType().is64BitVector() &&
5059 "unexpected types for extended operands to VMULL");
5060 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
5061 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
5062 DAG.getConstant(0, DL, MVT::i64));
5063 }
5064 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
5065 // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
5066 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
5067   SDValue N00 = skipExtensionForVectorMULL(N0.getOperand(0), DAG);
5068   SDValue N01 = skipExtensionForVectorMULL(N0.getOperand(1), DAG);
5069   EVT Op1VT = Op1.getValueType();
5070   return DAG.getNode(
5071       ISD::EXTRACT_SUBVECTOR, DL, OVT,
5072       DAG.getNode(N0.getOpcode(), DL, VT,
5073 DAG.getNode(NewOpc, DL, VT,
5074 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5075 DAG.getNode(NewOpc, DL, VT,
5076 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5077 DAG.getConstant(0, DL, MVT::i64));
5078}
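// Illustrative sketch, not part of this file: why the SMULL/UMULL selection in
// LowerMUL/selectUmullSmull is sound. When both multiplicands of an element-wise
// multiply are known to fit in half the element width (e.g. sign extensions from
// i8 into i16 lanes, or build_vectors of half-width constants), the full-width
// product cannot overflow, which is exactly what the widening multiply computes
// per lane. One signed lane, with a made-up helper name:
#include <cstdint>

static int16_t SMullLane(int8_t A, int8_t B) {
  // |A * B| <= 128 * 128 = 16384, which always fits in an i16 result.
  return static_cast<int16_t>(static_cast<int16_t>(A) * static_cast<int16_t>(B));
}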
5079
5080static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5081 int Pattern) {
5082 if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
5083 return DAG.getConstant(1, DL, MVT::nxv1i1);
5084 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
5085 DAG.getTargetConstant(Pattern, DL, MVT::i32));
5086}
5087
5088 static SDValue optimizeIncrementingWhile(SDValue Op, SelectionDAG &DAG,
5089                                          bool IsSigned, bool IsEqual) {
5090 if (!isa<ConstantSDNode>(Op.getOperand(1)) ||
5091 !isa<ConstantSDNode>(Op.getOperand(2)))
5092 return SDValue();
5093
5094 SDLoc dl(Op);
5095 APInt X = Op.getConstantOperandAPInt(1);
5096 APInt Y = Op.getConstantOperandAPInt(2);
5097 bool Overflow;
5098 APInt NumActiveElems =
5099 IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
5100
5101 if (Overflow)
5102 return SDValue();
5103
5104 if (IsEqual) {
5105 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
5106 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
5107 : NumActiveElems.uadd_ov(One, Overflow);
5108 if (Overflow)
5109 return SDValue();
5110 }
5111
5112 std::optional<unsigned> PredPattern =
5113       getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue());
5114   unsigned MinSVEVectorSize = std::max(
5115       DAG.getSubtarget<AArch64Subtarget>().getMinSVEVectorSizeInBits(), 128u);
5116 unsigned ElementSize = 128 / Op.getValueType().getVectorMinNumElements();
5117 if (PredPattern != std::nullopt &&
5118 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
5119 return getPTrue(DAG, dl, Op.getValueType(), *PredPattern);
5120
5121 return SDValue();
5122}
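// Illustrative sketch, not part of this file: the active-element count that
// drives the PTRUE pattern selection above, modelled for the unsigned forms
// only (whilelo/whilels). It mirrors the code's behaviour of giving up when the
// subtraction or the +1 for the inclusive form overflows; the helper name is
// made up for illustration.
#include <cstdint>
#include <optional>

static std::optional<uint64_t> ActiveLaneCount(uint64_t X, uint64_t Y, bool IsEqual) {
  if (Y < X)            // usub_ov would overflow; bail out like the lowering does
    return std::nullopt;
  uint64_t N = Y - X;   // leading active lanes produced by whilelo(X, Y)
  if (IsEqual) {        // whilels also includes the end value, i.e. one more lane
    if (N == UINT64_MAX)
      return std::nullopt;
    ++N;
  }
  return N;
}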
5123
5124// Returns a safe bitcast between two scalable vector predicates, where
5125// any newly created lanes from a widening bitcast are defined as zero.
5126 static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
5127   SDLoc DL(Op);
5128 EVT InVT = Op.getValueType();
5129
5130 assert(InVT.getVectorElementType() == MVT::i1 &&
5131 VT.getVectorElementType() == MVT::i1 &&
5132 "Expected a predicate-to-predicate bitcast");
5133   assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5134          InVT.isScalableVector() &&
5135 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
5136 "Only expect to cast between legal scalable predicate types!");
5137
5138 // Return the operand if the cast isn't changing type,
5139 // e.g. <n x 16 x i1> -> <n x 16 x i1>
5140 if (InVT == VT)
5141 return Op;
5142
5143 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
5144
5145 // We only have to zero the lanes if new lanes are being defined, e.g. when
5146 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
5147 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
5148 // we can return here.
5149 if (InVT.bitsGT(VT))
5150 return Reinterpret;
5151
5152 // Check if the other lanes are already known to be zeroed by
5153 // construction.
5154   if (isZeroingInactiveLanes(Op))
5155     return Reinterpret;
5156
5157 // Zero the newly introduced lanes.
5158 SDValue Mask = DAG.getConstant(1, DL, InVT);
5159 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
5160 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
5161}
5162
5163SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
5164 SDValue Chain, SDLoc DL,
5165 EVT VT) const {
5166   SDValue Callee = DAG.getExternalSymbol("__arm_sme_state",
5167                                          getPointerTy(DAG.getDataLayout()));
5168   Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
5169   Type *RetTy = StructType::get(Int64Ty, Int64Ty);
5170   TargetLowering::CallLoweringInfo CLI(DAG);
5171   ArgListTy Args;
5172   CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
5173       CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2,
5174       RetTy, Callee, std::move(Args));
5175 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5176 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
5177 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
5178 Mask);
5179}
5180
5181// Lower an SME LDR/STR ZA intrinsic
5182// Case 1: If the vector number (vecnum) is an immediate in range, it gets
5183// folded into the instruction
5184// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
5185// Case 2: If the vecnum is not an immediate, then it is used to modify the base
5186// and tile slice registers
5187// ldr(%tileslice, %ptr, %vecnum)
5188// ->
5189// %svl = rdsvl
5190// %ptr2 = %ptr + %svl * %vecnum
5191// %tileslice2 = %tileslice + %vecnum
5192// ldr [%tileslice2, 0], [%ptr2, 0]
5193// Case 3: If the vecnum is an immediate out of range, then the same is done as
5194// case 2, but the base and slice registers are modified by the greatest
5195// multiple of 15 lower than the vecnum and the remainder is folded into the
5196// instruction. This means that successive loads and stores that are offset from
5197// each other can share the same base and slice register updates.
5198// ldr(%tileslice, %ptr, 22)
5199// ldr(%tileslice, %ptr, 23)
5200// ->
5201// %svl = rdsvl
5202// %ptr2 = %ptr + %svl * 15
5203// %tileslice2 = %tileslice + 15
5204// ldr [%tileslice2, 7], [%ptr2, 7]
5205// ldr [%tileslice2, 8], [%ptr2, 8]
5206// Case 4: If the vecnum is an add of an immediate, then the non-immediate
5207// operand and the immediate can be folded into the instruction, like case 2.
5208// ldr(%tileslice, %ptr, %vecnum + 7)
5209// ldr(%tileslice, %ptr, %vecnum + 8)
5210// ->
5211// %svl = rdsvl
5212// %ptr2 = %ptr + %svl * %vecnum
5213// %tileslice2 = %tileslice + %vecnum
5214// ldr [%tileslice2, 7], [%ptr2, 7]
5215// ldr [%tileslice2, 8], [%ptr2, 8]
5216// Case 5: The vecnum being an add of an immediate out of range is also handled,
5217// in which case the same remainder logic as case 3 is used.
5218 static SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
5219   SDLoc DL(N);
5220
5221 SDValue TileSlice = N->getOperand(2);
5222 SDValue Base = N->getOperand(3);
5223 SDValue VecNum = N->getOperand(4);
5224 int32_t ConstAddend = 0;
5225 SDValue VarAddend = VecNum;
5226
5227 // If the vnum is an add of an immediate, we can fold it into the instruction
5228 if (VecNum.getOpcode() == ISD::ADD &&
5229 isa<ConstantSDNode>(VecNum.getOperand(1))) {
5230 ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
5231 VarAddend = VecNum.getOperand(0);
5232 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
5233 ConstAddend = ImmNode->getSExtValue();
5234 VarAddend = SDValue();
5235 }
5236
5237 int32_t ImmAddend = ConstAddend % 16;
5238 if (int32_t C = (ConstAddend - ImmAddend)) {
5239 SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
5240 VarAddend = VarAddend
5241 ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
5242 : CVal;
5243 }
5244
5245 if (VarAddend) {
5246 // Get the vector length that will be multiplied by vnum
5247 auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
5248 DAG.getConstant(1, DL, MVT::i32));
5249
5250 // Multiply SVL and vnum then add it to the base
5251 SDValue Mul = DAG.getNode(
5252 ISD::MUL, DL, MVT::i64,
5253 {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
5254 Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
5255 // Just add vnum to the tileslice
5256 TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
5257 }
5258
5259   return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
5260                      DL, MVT::Other,
5261 {/*Chain=*/N.getOperand(0), TileSlice, Base,
5262 DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
5263}
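// Illustrative sketch, not part of this file: the constant-addend split done
// above. Only an immediate in the instruction's 0..15 vnum range is folded into
// the LDR/STR; the remainder is pushed into the base/tile-slice updates so that
// neighbouring accesses can share them. Types and names are made up.
#include <cstdint>

struct AddendSplit {
  int32_t FoldedImm;   // folded into the instruction's vnum field
  int32_t BaseAddend;  // added (scaled by SVL) to the base and tile slice
};

static AddendSplit SplitVecnumAddend(int32_t ConstAddend) {
  int32_t ImmAddend = ConstAddend % 16; // mirrors the split in the code above
  return {ImmAddend, ConstAddend - ImmAddend};
}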
5264
5265SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
5266 SelectionDAG &DAG) const {
5267 unsigned IntNo = Op.getConstantOperandVal(1);
5268 SDLoc DL(Op);
5269 switch (IntNo) {
5270 default:
5271 return SDValue(); // Don't custom lower most intrinsics.
5272 case Intrinsic::aarch64_prefetch: {
5273 SDValue Chain = Op.getOperand(0);
5274 SDValue Addr = Op.getOperand(2);
5275
5276 unsigned IsWrite = Op.getConstantOperandVal(3);
5277 unsigned Locality = Op.getConstantOperandVal(4);
5278 unsigned IsStream = Op.getConstantOperandVal(5);
5279 unsigned IsData = Op.getConstantOperandVal(6);
5280 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
5281 (!IsData << 3) | // IsDataCache bit
5282 (Locality << 1) | // Cache level bits
5283 (unsigned)IsStream; // Stream bit
5284
5285 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
5286 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
5287 }
5288 case Intrinsic::aarch64_sme_str:
5289 case Intrinsic::aarch64_sme_ldr: {
5290 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
5291 }
5292 case Intrinsic::aarch64_sme_za_enable:
5293 return DAG.getNode(
5294 AArch64ISD::SMSTART, DL, MVT::Other,
5295 Op->getOperand(0), // Chain
5296 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5297 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5298 case Intrinsic::aarch64_sme_za_disable:
5299 return DAG.getNode(
5300 AArch64ISD::SMSTOP, DL, MVT::Other,
5301 Op->getOperand(0), // Chain
5302 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5303 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5304 }
5305}
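// Illustrative sketch, not part of this file: the PRFM operand packing used by
// the aarch64_prefetch case above, with the same bit positions (write bit 4,
// data-vs-instruction bit 3, locality bits 2:1, stream bit 0). Hypothetical
// helper name.
static unsigned EncodePrfOp(unsigned IsWrite, unsigned Locality,
                            unsigned IsStream, unsigned IsData) {
  return (IsWrite << 4) | ((!IsData) << 3) | (Locality << 1) | (IsStream & 1u);
}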
5306
5307SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5308 SelectionDAG &DAG) const {
5309 unsigned IntNo = Op.getConstantOperandVal(1);
5310 SDLoc DL(Op);
5311 switch (IntNo) {
5312 default:
5313 return SDValue(); // Don't custom lower most intrinsics.
5314 case Intrinsic::aarch64_mops_memset_tag: {
5315 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
5316 SDValue Chain = Node->getChain();
5317 SDValue Dst = Op.getOperand(2);
5318 SDValue Val = Op.getOperand(3);
5319 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
5320 SDValue Size = Op.getOperand(4);
5321 auto Alignment = Node->getMemOperand()->getAlign();
5322 bool IsVol = Node->isVolatile();
5323 auto DstPtrInfo = Node->getPointerInfo();
5324
5325 const auto &SDI =
5326 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
5327 SDValue MS =
5328 SDI.EmitMOPS(AArch64ISD::MOPS_MEMSET_TAGGING, DAG, DL, Chain, Dst, Val,
5329 Size, Alignment, IsVol, DstPtrInfo, MachinePointerInfo{});
5330
5331 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
5332 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
5333 // LowerOperationWrapper will complain that the number of results has
5334 // changed.
5335 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
5336 }
5337 }
5338}
5339
5340SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
5341 SelectionDAG &DAG) const {
5342 unsigned IntNo = Op.getConstantOperandVal(0);
5343 SDLoc dl(Op);
5344 switch (IntNo) {
5345 default: return SDValue(); // Don't custom lower most intrinsics.
5346 case Intrinsic::thread_pointer: {
5347 EVT PtrVT = getPointerTy(DAG.getDataLayout());
5348 return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
5349 }
5350 case Intrinsic::aarch64_neon_abs: {
5351 EVT Ty = Op.getValueType();
5352 if (Ty == MVT::i64) {
5353 SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
5354 Op.getOperand(1));
5355 Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
5356 return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
5357 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
5358 return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
5359 } else {
5360       report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
5361 }
5362 }
5363 case Intrinsic::aarch64_neon_pmull64: {
5364 SDValue LHS = Op.getOperand(1);
5365 SDValue RHS = Op.getOperand(2);
5366
5367     std::optional<uint64_t> LHSLane =
5368         getConstantLaneNumOfExtractHalfOperand(LHS);
5369     std::optional<uint64_t> RHSLane =
5370         getConstantLaneNumOfExtractHalfOperand(RHS);
5371
5372 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
5373 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
5374
5375     // 'aarch64_neon_pmull64' takes i64 parameters, while pmull/pmull2
5376     // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
5377     // which ISel recognizes better. For example, generate an ldr into d*
5378 // registers as opposed to a GPR load followed by a fmov.
5379 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
5380 std::optional<uint64_t> OtherLane,
5381 const SDLoc &dl,
5382 SelectionDAG &DAG) -> SDValue {
5383       // If the operand is a higher half itself, rewrite it to
5384 // extract_high_v2i64; this way aarch64_neon_pmull64 could
5385 // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
5386 if (NLane && *NLane == 1)
5387 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
5388 N.getOperand(0), DAG.getConstant(1, dl, MVT::i64));
5389
5390 // Operand N is not a higher half but the other operand is.
5391 if (OtherLane && *OtherLane == 1) {
5392 // If this operand is a lower half, rewrite it to
5393 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
5394 // align lanes of two operands. A roundtrip sequence (to move from lane
5395 // 1 to lane 0) is like this:
5396 // mov x8, v0.d[1]
5397 // fmov d0, x8
5398 if (NLane && *NLane == 0)
5399 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
5400 DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64,
5401 N.getOperand(0),
5402 DAG.getConstant(0, dl, MVT::i64)),
5403 DAG.getConstant(1, dl, MVT::i64));
5404
5405 // Otherwise just dup from main to all lanes.
5406 return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N);
5407 }
5408
5409 // Neither operand is an extract of higher half, so codegen may just use
5410 // the non-high version of PMULL instruction. Use v1i64 to represent i64.
5411 assert(N.getValueType() == MVT::i64 &&
5412 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
5413 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N);
5414 };
5415
5416 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG);
5417 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG);
5418
5419 return DAG.getNode(AArch64ISD::PMULL, dl, Op.getValueType(), LHS, RHS);
5420 }
5421 case Intrinsic::aarch64_neon_smax:
5422 return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
5423 Op.getOperand(1), Op.getOperand(2));
5424 case Intrinsic::aarch64_neon_umax:
5425 return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
5426 Op.getOperand(1), Op.getOperand(2));
5427 case Intrinsic::aarch64_neon_smin:
5428 return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
5429 Op.getOperand(1), Op.getOperand(2));
5430 case Intrinsic::aarch64_neon_umin:
5431 return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
5432 Op.getOperand(1), Op.getOperand(2));
5433 case Intrinsic::aarch64_neon_scalar_sqxtn:
5434 case Intrinsic::aarch64_neon_scalar_sqxtun:
5435 case Intrinsic::aarch64_neon_scalar_uqxtn: {
5436 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
5437 if (Op.getValueType() == MVT::i32)
5438 return DAG.getNode(ISD::BITCAST, dl, MVT::i32,
5439 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::f32,
5440 Op.getOperand(0),
5441 DAG.getNode(ISD::BITCAST, dl, MVT::f64,
5442 Op.getOperand(1))));
5443 return SDValue();
5444 }
5445 case Intrinsic::aarch64_sve_whilelo:
5446 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
5447 /*IsEqual=*/false);
5448 case Intrinsic::aarch64_sve_whilelt:
5449 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
5450 /*IsEqual=*/false);
5451 case Intrinsic::aarch64_sve_whilels:
5452 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
5453 /*IsEqual=*/true);
5454 case Intrinsic::aarch64_sve_whilele:
5455 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
5456 /*IsEqual=*/true);
5457 case Intrinsic::aarch64_sve_sunpkhi:
5458 return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
5459 Op.getOperand(1));
5460 case Intrinsic::aarch64_sve_sunpklo:
5461 return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
5462 Op.getOperand(1));
5463 case Intrinsic::aarch64_sve_uunpkhi:
5464 return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
5465 Op.getOperand(1));
5466 case Intrinsic::aarch64_sve_uunpklo:
5467 return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
5468 Op.getOperand(1));
5469 case Intrinsic::aarch64_sve_clasta_n:
5470 return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
5471 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5472 case Intrinsic::aarch64_sve_clastb_n:
5473 return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
5474 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5475 case Intrinsic::aarch64_sve_lasta:
5476 return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
5477 Op.getOperand(1), Op.getOperand(2));
5478 case Intrinsic::aarch64_sve_lastb:
5479 return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
5480 Op.getOperand(1), Op.getOperand(2));
5481 case Intrinsic::aarch64_sve_rev:
5482 return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
5483 Op.getOperand(1));
5484 case Intrinsic::aarch64_sve_tbl:
5485 return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
5486 Op.getOperand(1), Op.getOperand(2));
5487 case Intrinsic::aarch64_sve_trn1:
5488 return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
5489 Op.getOperand(1), Op.getOperand(2));
5490 case Intrinsic::aarch64_sve_trn2:
5491 return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
5492 Op.getOperand(1), Op.getOperand(2));
5493 case Intrinsic::aarch64_sve_uzp1:
5494 return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
5495 Op.getOperand(1), Op.getOperand(2));
5496 case Intrinsic::aarch64_sve_uzp2:
5497 return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
5498 Op.getOperand(1), Op.getOperand(2));
5499 case Intrinsic::aarch64_sve_zip1:
5500 return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
5501 Op.getOperand(1), Op.getOperand(2));
5502 case Intrinsic::aarch64_sve_zip2:
5503 return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
5504 Op.getOperand(1), Op.getOperand(2));
5505 case Intrinsic::aarch64_sve_splice:
5506 return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
5507 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5508 case Intrinsic::aarch64_sve_ptrue:
5509 return getPTrue(DAG, dl, Op.getValueType(), Op.getConstantOperandVal(1));
5510 case Intrinsic::aarch64_sve_clz:
5511 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
5512 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5513 case Intrinsic::aarch64_sme_cntsb:
5514 return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5515 DAG.getConstant(1, dl, MVT::i32));
5516 case Intrinsic::aarch64_sme_cntsh: {
5517 SDValue One = DAG.getConstant(1, dl, MVT::i32);
5518 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One);
5519 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One);
5520 }
5521 case Intrinsic::aarch64_sme_cntsw: {
5522 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5523 DAG.getConstant(1, dl, MVT::i32));
5524 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
5525 DAG.getConstant(2, dl, MVT::i32));
5526 }
5527 case Intrinsic::aarch64_sme_cntsd: {
5528 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5529 DAG.getConstant(1, dl, MVT::i32));
5530 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
5531 DAG.getConstant(3, dl, MVT::i32));
5532 }
5533 case Intrinsic::aarch64_sve_cnt: {
5534 SDValue Data = Op.getOperand(3);
5535 // CTPOP only supports integer operands.
5536 if (Data.getValueType().isFloatingPoint())
5537 Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
5538 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
5539 Op.getOperand(2), Data, Op.getOperand(1));
5540 }
5541 case Intrinsic::aarch64_sve_dupq_lane:
5542 return LowerDUPQLane(Op, DAG);
5543 case Intrinsic::aarch64_sve_convert_from_svbool:
5544 if (Op.getValueType() == MVT::aarch64svcount)
5545 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Op.getOperand(1));
5546 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
5547 case Intrinsic::aarch64_sve_convert_to_svbool:
5548 if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
5549 return DAG.getNode(ISD::BITCAST, dl, MVT::nxv16i1, Op.getOperand(1));
5550 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
5551 case Intrinsic::aarch64_sve_fneg:
5552 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
5553 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5554 case Intrinsic::aarch64_sve_frintp:
5555 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
5556 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5557 case Intrinsic::aarch64_sve_frintm:
5558 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
5559 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5560 case Intrinsic::aarch64_sve_frinti:
5561 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(),
5562 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5563 case Intrinsic::aarch64_sve_frintx:
5564 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
5565 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5566 case Intrinsic::aarch64_sve_frinta:
5567 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
5568 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5569 case Intrinsic::aarch64_sve_frintn:
5570 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(),
5571 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5572 case Intrinsic::aarch64_sve_frintz:
5573 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
5574 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5575 case Intrinsic::aarch64_sve_ucvtf:
5576     return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl,
5577                        Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5578                        Op.getOperand(1));
5579   case Intrinsic::aarch64_sve_scvtf:
5580     return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl,
5581                        Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5582                        Op.getOperand(1));
5583   case Intrinsic::aarch64_sve_fcvtzu:
5584     return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl,
5585                        Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5586                        Op.getOperand(1));
5587   case Intrinsic::aarch64_sve_fcvtzs:
5588     return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl,
5589                        Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5590                        Op.getOperand(1));
5591 case Intrinsic::aarch64_sve_fsqrt:
5592 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
5593 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5594 case Intrinsic::aarch64_sve_frecpx:
5595 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
5596 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5597 case Intrinsic::aarch64_sve_frecpe_x:
5598 return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(),
5599 Op.getOperand(1));
5600 case Intrinsic::aarch64_sve_frecps_x:
5601 return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(),
5602 Op.getOperand(1), Op.getOperand(2));
5603 case Intrinsic::aarch64_sve_frsqrte_x:
5604 return DAG.getNode(AArch64ISD::FRSQRTE, dl, Op.getValueType(),
5605 Op.getOperand(1));
5606 case Intrinsic::aarch64_sve_frsqrts_x:
5607 return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(),
5608 Op.getOperand(1), Op.getOperand(2));
5609 case Intrinsic::aarch64_sve_fabs:
5610 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
5611 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5612 case Intrinsic::aarch64_sve_abs:
5613 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
5614 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5615 case Intrinsic::aarch64_sve_neg:
5616 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
5617 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5618 case Intrinsic::aarch64_sve_insr: {
5619 SDValue Scalar = Op.getOperand(2);
5620 EVT ScalarTy = Scalar.getValueType();
5621 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
5622 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
5623
5624 return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
5625 Op.getOperand(1), Scalar);
5626 }
5627 case Intrinsic::aarch64_sve_rbit:
5628     return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl,
5629                        Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5630 Op.getOperand(1));
5631 case Intrinsic::aarch64_sve_revb:
5632 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
5633 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5634 case Intrinsic::aarch64_sve_revh:
5635 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(),
5636 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5637 case Intrinsic::aarch64_sve_revw:
5638 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(),
5639 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5640 case Intrinsic::aarch64_sve_revd:
5641 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(),
5642 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5643 case Intrinsic::aarch64_sve_sxtb:
5644 return DAG.getNode(
5645         AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5646         Op.getOperand(2), Op.getOperand(3),
5647 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
5648 Op.getOperand(1));
5649 case Intrinsic::aarch64_sve_sxth:
5650 return DAG.getNode(
5651         AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5652         Op.getOperand(2), Op.getOperand(3),
5653 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
5654 Op.getOperand(1));
5655 case Intrinsic::aarch64_sve_sxtw:
5656 return DAG.getNode(
5657         AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5658         Op.getOperand(2), Op.getOperand(3),
5659 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
5660 Op.getOperand(1));
5661 case Intrinsic::aarch64_sve_uxtb:
5662 return DAG.getNode(
5663         AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5664         Op.getOperand(2), Op.getOperand(3),
5665 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
5666 Op.getOperand(1));
5667 case Intrinsic::aarch64_sve_uxth:
5668 return DAG.getNode(
5669         AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5670         Op.getOperand(2), Op.getOperand(3),
5671 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
5672 Op.getOperand(1));
5673 case Intrinsic::aarch64_sve_uxtw:
5674 return DAG.getNode(
5675         AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5676         Op.getOperand(2), Op.getOperand(3),
5677 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
5678 Op.getOperand(1));
5679 case Intrinsic::localaddress: {
5680 const auto &MF = DAG.getMachineFunction();
5681 const auto *RegInfo = Subtarget->getRegisterInfo();
5682 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
5683 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
5684 Op.getSimpleValueType());
5685 }
5686
5687 case Intrinsic::eh_recoverfp: {
5688 // FIXME: This needs to be implemented to correctly handle highly aligned
5689 // stack objects. For now we simply return the incoming FP. Refer D53541
5690 // for more details.
5691 SDValue FnOp = Op.getOperand(1);
5692 SDValue IncomingFPOp = Op.getOperand(2);
5693 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
5694 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
5695 if (!Fn)
5697 "llvm.eh.recoverfp must take a function as the first argument");
5698 return IncomingFPOp;
5699 }
5700
5701 case Intrinsic::aarch64_neon_vsri:
5702 case Intrinsic::aarch64_neon_vsli:
5703 case Intrinsic::aarch64_sve_sri:
5704 case Intrinsic::aarch64_sve_sli: {
5705 EVT Ty = Op.getValueType();
5706
5707 if (!Ty.isVector())
5708 report_fatal_error("Unexpected type for aarch64_neon_vsli");
5709
5710 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
5711
5712 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
5713 IntNo == Intrinsic::aarch64_sve_sri;
5714 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
5715 return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
5716 Op.getOperand(3));
5717 }
5718
5719 case Intrinsic::aarch64_neon_srhadd:
5720 case Intrinsic::aarch64_neon_urhadd:
5721 case Intrinsic::aarch64_neon_shadd:
5722 case Intrinsic::aarch64_neon_uhadd: {
5723 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
5724 IntNo == Intrinsic::aarch64_neon_shadd);
5725 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
5726 IntNo == Intrinsic::aarch64_neon_urhadd);
5727 unsigned Opcode = IsSignedAdd
5728 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
5729 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
5730 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
5731 Op.getOperand(2));
5732 }
5733 case Intrinsic::aarch64_neon_saddlp:
5734 case Intrinsic::aarch64_neon_uaddlp: {
5735 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
5736                           ? AArch64ISD::UADDLP
5737                           : AArch64ISD::SADDLP;
5738     return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
5739 }
5740 case Intrinsic::aarch64_neon_sdot:
5741 case Intrinsic::aarch64_neon_udot:
5742 case Intrinsic::aarch64_sve_sdot:
5743 case Intrinsic::aarch64_sve_udot: {
5744 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
5745 IntNo == Intrinsic::aarch64_sve_udot)
5746                           ? AArch64ISD::UDOT
5747                           : AArch64ISD::SDOT;
5748     return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
5749 Op.getOperand(2), Op.getOperand(3));
5750 }
5751 case Intrinsic::get_active_lane_mask: {
5752 SDValue ID =
5753 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
5754
5755 EVT VT = Op.getValueType();
5756 if (VT.isScalableVector())
5757 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, Op.getOperand(1),
5758 Op.getOperand(2));
5759
5760 // We can use the SVE whilelo instruction to lower this intrinsic by
5761 // creating the appropriate sequence of scalable vector operations and
5762 // then extracting a fixed-width subvector from the scalable vector.
5763
5764 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
5765 EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
5766
5767 SDValue Mask = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, WhileVT, ID,
5768 Op.getOperand(1), Op.getOperand(2));
5769 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, dl, ContainerVT, Mask);
5770 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, MaskAsInt,
5771 DAG.getVectorIdxConstant(0, dl));
5772 }
5773 case Intrinsic::aarch64_neon_uaddlv: {
5774 EVT OpVT = Op.getOperand(1).getValueType();
5775 EVT ResVT = Op.getValueType();
5776 if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
5777 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) {
5778       // To avoid an insert_subvector, use v4i32 rather than v2i32.
5779 SDValue UADDLV =
5780 DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1));
5781 SDValue EXTRACT_VEC_ELT =
5782 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, UADDLV,
5783 DAG.getConstant(0, dl, MVT::i64));
5784 return EXTRACT_VEC_ELT;
5785 }
5786 return SDValue();
5787 }
5788 case Intrinsic::experimental_cttz_elts: {
5789 SDValue NewCttzElts =
5790 DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, Op.getOperand(1));
5791
5792 return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
5793 }
5794 }
5795}
5796
5797bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
5798 if (VT.getVectorElementType() == MVT::i8 ||
5799 VT.getVectorElementType() == MVT::i16) {
5800 EltTy = MVT::i32;
5801 return true;
5802 }
5803 return false;
5804}
5805
5806bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
5807 EVT DataVT) const {
5808 const EVT IndexVT = Extend.getOperand(0).getValueType();
5809 // SVE only supports implicit extension of 32-bit indices.
5810 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
5811 return false;
5812
5813 // Indices cannot be smaller than the main data type.
5814 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
5815 return false;
5816
5817 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
5818 // element container type, which would violate the previous clause.
5819 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
5820}
5821
5822bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
5823 EVT ExtVT = ExtVal.getValueType();
5824 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
5825 return false;
5826
5827 // It may be worth creating extending masked loads if there are multiple
5828 // masked loads using the same predicate. That way we'll end up creating
5829 // extending masked loads that may then get split by the legaliser. This
5830 // results in just one set of predicate unpacks at the start, instead of
5831 // multiple sets of vector unpacks after each load.
5832 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
5833 if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) {
5834 // Disable extending masked loads for fixed-width for now, since the code
5835 // quality doesn't look great.
5836 if (!ExtVT.isScalableVector())
5837 return false;
5838
5839 unsigned NumExtMaskedLoads = 0;
5840 for (auto *U : Ld->getMask()->uses())
5841 if (isa<MaskedLoadSDNode>(U))
5842 NumExtMaskedLoads++;
5843
5844 if (NumExtMaskedLoads <= 1)
5845 return false;
5846 }
5847 }
5848
5849 return true;
5850}
5851
5852unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
5853 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
5854 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
5855        AArch64ISD::GLD1_MERGE_ZERO},
5856       {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
5857        AArch64ISD::GLD1_UXTW_MERGE_ZERO},
5858       {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
5859        AArch64ISD::GLD1_MERGE_ZERO},
5860       {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
5861        AArch64ISD::GLD1_SXTW_MERGE_ZERO},
5862       {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
5863        AArch64ISD::GLD1_SCALED_MERGE_ZERO},
5864       {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
5865        AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
5866       {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
5867        AArch64ISD::GLD1_SCALED_MERGE_ZERO},
5868       {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
5869        AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
5870   };
5871 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
5872 return AddrModes.find(Key)->second;
5873}
5874
5875unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
5876 switch (Opcode) {
5877 default:
5878 llvm_unreachable("unimplemented opcode");
5879 return Opcode;
5894 }
5895}
5896
5897SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
5898 SelectionDAG &DAG) const {
5899 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
5900
5901 SDLoc DL(Op);
5902 SDValue Chain = MGT->getChain();
5903 SDValue PassThru = MGT->getPassThru();
5904 SDValue Mask = MGT->getMask();
5905 SDValue BasePtr = MGT->getBasePtr();
5906 SDValue Index = MGT->getIndex();
5907 SDValue Scale = MGT->getScale();
5908 EVT VT = Op.getValueType();
5909 EVT MemVT = MGT->getMemoryVT();
5910 ISD::LoadExtType ExtType = MGT->getExtensionType();
5911 ISD::MemIndexType IndexType = MGT->getIndexType();
5912
5913   // SVE supports only zero (and so undef) passthrough values; everything else
5914   // must be handled manually by an explicit select on the load's output.
5915 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
5916 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
5917 SDValue Load =
5918 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
5919 MGT->getMemOperand(), IndexType, ExtType);
5920 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
5921 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
5922 }
5923
5924 bool IsScaled = MGT->isIndexScaled();
5925 bool IsSigned = MGT->isIndexSigned();
5926
5927   // SVE only supports an index scaled by sizeof(MemVT.elt); everything else
5928   // must be calculated beforehand.
5929 uint64_t ScaleVal = Scale->getAsZExtVal();
5930 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
5931 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
5932 EVT IndexVT = Index.getValueType();
5933 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
5934 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
5935 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
5936
5937 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
5938 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
5939 MGT->getMemOperand(), IndexType, ExtType);
5940 }
5941
5942 // Lower fixed length gather to a scalable equivalent.
5943 if (VT.isFixedLengthVector()) {
5944 assert(Subtarget->useSVEForFixedLengthVectors() &&
5945 "Cannot lower when not using SVE for fixed vectors!");
5946
5947 // NOTE: Handle floating-point as if integer then bitcast the result.
5948     EVT DataVT = VT.changeVectorElementTypeToInteger();
5949 MemVT = MemVT.changeVectorElementTypeToInteger();
5950
5951 // Find the smallest integer fixed length vector we can use for the gather.
5952 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
5953 if (DataVT.getVectorElementType() == MVT::i64 ||
5954 Index.getValueType().getVectorElementType() == MVT::i64 ||
5955 Mask.getValueType().getVectorElementType() == MVT::i64)
5956 PromotedVT = VT.changeVectorElementType(MVT::i64);
5957
5958 // Promote vector operands except for passthrough, which we know is either
5959 // undef or zero, and thus best constructed directly.
5960 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5961 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
5962 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
5963
5964 // A promoted result type forces the need for an extending load.
5965 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
5966 ExtType = ISD::EXTLOAD;
5967
5968 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
5969
5970 // Convert fixed length vector operands to scalable.
5971 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
5972 Index = convertToScalableVector(DAG, ContainerVT, Index);
5974 PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
5975 : DAG.getConstant(0, DL, ContainerVT);
5976
5977 // Emit equivalent scalable vector gather.
5978 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
5979 SDValue Load =
5980 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
5981 Ops, MGT->getMemOperand(), IndexType, ExtType);
5982
5983 // Extract fixed length data then convert to the required result type.
5984 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
5985 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
5986 if (VT.isFloatingPoint())
5987 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
5988
5989 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
5990 }
5991
5992 // Everything else is legal.
5993 return Op;
5994}
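// Worked example (sketch) for the rewrite above: a power-of-two scale is folded
// into the index without changing the address, since
//   Base + (Index << Log2(ScaleVal)) * 1 == Base + Index * ScaleVal.
// For instance, gathering i16 elements (scalar store size 2) with Scale == 8
// shifts the index left by 3 and continues with an unscaled (Scale == 1) gather.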
5995
5996SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
5997 SelectionDAG &DAG) const {
5998 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
5999
6000 SDLoc DL(Op);
6001 SDValue Chain = MSC->getChain();
6002 SDValue StoreVal = MSC->getValue();
6003 SDValue Mask = MSC->getMask();
6004 SDValue BasePtr = MSC->getBasePtr();
6005 SDValue Index = MSC->getIndex();
6006 SDValue Scale = MSC->getScale();
6007 EVT VT = StoreVal.getValueType();
6008 EVT MemVT = MSC->getMemoryVT();
6009 ISD::MemIndexType IndexType = MSC->getIndexType();
6010 bool Truncating = MSC->isTruncatingStore();
6011
6012 bool IsScaled = MSC->isIndexScaled();
6013 bool IsSigned = MSC->isIndexSigned();
6014
6015   // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
6016   // must be calculated beforehand.
6017 uint64_t ScaleVal = Scale->getAsZExtVal();
6018 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6019 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6020 EVT IndexVT = Index.getValueType();
6021 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6022 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6023 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6024
6025 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6026 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6027 MSC->getMemOperand(), IndexType, Truncating);
6028 }
6029
6030 // Lower fixed length scatter to a scalable equivalent.
6031 if (VT.isFixedLengthVector()) {
6032 assert(Subtarget->useSVEForFixedLengthVectors() &&
6033 "Cannot lower when not using SVE for fixed vectors!");
6034
6035     // Once bitcast, we treat floating-point scatters as if they were integer.
6036 if (VT.isFloatingPoint()) {
6038 MemVT = MemVT.changeVectorElementTypeToInteger();
6039 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
6040 }
6041
6042 // Find the smallest integer fixed length vector we can use for the scatter.
6043 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6044 if (VT.getVectorElementType() == MVT::i64 ||
6045 Index.getValueType().getVectorElementType() == MVT::i64 ||
6046 Mask.getValueType().getVectorElementType() == MVT::i64)
6047 PromotedVT = VT.changeVectorElementType(MVT::i64);
6048
6049 // Promote vector operands.
6050 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6051 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6052 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6053 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
6054
6055 // A promoted value type forces the need for a truncating store.
6056 if (PromotedVT != VT)
6057 Truncating = true;
6058
6059 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6060
6061 // Convert fixed length vector operands to scalable.
6062 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6063 Index = convertToScalableVector(DAG, ContainerVT, Index);
6065 StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
6066
6067 // Emit equivalent scalable vector scatter.
6068 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6069 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6070 MSC->getMemOperand(), IndexType, Truncating);
6071 }
6072
6073 // Everything else is legal.
6074 return Op;
6075}
6076
6077SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
6078 SDLoc DL(Op);
6079 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
6080 assert(LoadNode && "Expected custom lowering of a masked load node");
6081 EVT VT = Op->getValueType(0);
6082
6083 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
6084 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
6085
6086 SDValue PassThru = LoadNode->getPassThru();
6087 SDValue Mask = LoadNode->getMask();
6088
6089 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
6090 return Op;
6091
6092   SDValue Load = DAG.getMaskedLoad(
6093 VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
6094 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
6095 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
6096 LoadNode->getExtensionType());
6097
6098 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6099
6100 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6101}
6102
6103// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
6104static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
6105 EVT VT, EVT MemVT,
6106 SelectionDAG &DAG) {
6107 assert(VT.isVector() && "VT should be a vector type");
6108 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
6109
6110 SDValue Value = ST->getValue();
6111
6112   // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and
6113   // extracts the word lane which represents the v4i8 subvector. It optimizes
6114   // the store to:
6115 //
6116 // xtn v0.8b, v0.8h
6117 // str s0, [x0]
6118
6119 SDValue Undef = DAG.getUNDEF(MVT::i16);
6120 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
6121 {Undef, Undef, Undef, Undef});
6122
6123 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
6124 Value, UndefVec);
6125 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
6126
6127 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
6128 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
6129 Trunc, DAG.getConstant(0, DL, MVT::i64));
6130
6131 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
6132 ST->getBasePtr(), ST->getMemOperand());
6133}
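// In DAG terms the sequence built above is, roughly (sketch):
//   (store (extract_vector_elt
//             (bitcast v2i32
//                (truncate v8i8 (concat_vectors Value:v4i16, undef:v4i16))), 0))
// which is what the xtn + "str s0" pair in the comment selects to.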
6134
6135// Custom lowering for any store, vector or scalar, default or truncating.
6136// Currently we only custom lower truncating stores from v4i16 to v4i8 and
6137// volatile stores of i128.
6138SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
6139 SelectionDAG &DAG) const {
6140 SDLoc Dl(Op);
6141 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
6142 assert (StoreNode && "Can only custom lower store nodes");
6143
6144 SDValue Value = StoreNode->getValue();
6145
6146 EVT VT = Value.getValueType();
6147 EVT MemVT = StoreNode->getMemoryVT();
6148
6149 if (VT.isVector()) {
6150     if (useSVEForFixedLengthVectorVT(
6151 VT,
6152 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
6153 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
6154
6155 unsigned AS = StoreNode->getAddressSpace();
6156 Align Alignment = StoreNode->getAlign();
6157 if (Alignment < MemVT.getStoreSize() &&
6158 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
6159 StoreNode->getMemOperand()->getFlags(),
6160 nullptr)) {
6161 return scalarizeVectorStore(StoreNode, DAG);
6162 }
6163
6164 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
6165 MemVT == MVT::v4i8) {
6166 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
6167 }
6168 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
6169 // the custom lowering, as there are no un-paired non-temporal stores and
6170 // legalization will break up 256 bit inputs.
6171     ElementCount EC = MemVT.getVectorElementCount();
6172 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
6173 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
6174 (MemVT.getScalarSizeInBits() == 8u ||
6175 MemVT.getScalarSizeInBits() == 16u ||
6176 MemVT.getScalarSizeInBits() == 32u ||
6177 MemVT.getScalarSizeInBits() == 64u)) {
6178 SDValue Lo =
6181 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
6182 SDValue Hi =
6185 StoreNode->getValue(),
6186 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
6187       SDValue Result = DAG.getMemIntrinsicNode(
6188 AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
6189 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
6190 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6191 return Result;
6192 }
6193 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
6194 return LowerStore128(Op, DAG);
6195 } else if (MemVT == MVT::i64x8) {
6196 SDValue Value = StoreNode->getValue();
6197 assert(Value->getValueType(0) == MVT::i64x8);
6198 SDValue Chain = StoreNode->getChain();
6199 SDValue Base = StoreNode->getBasePtr();
6200 EVT PtrVT = Base.getValueType();
6201 for (unsigned i = 0; i < 8; i++) {
6202 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
6203 Value, DAG.getConstant(i, Dl, MVT::i32));
6204 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
6205 DAG.getConstant(i * 8, Dl, PtrVT));
6206 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
6207 StoreNode->getOriginalAlign());
6208 }
6209 return Chain;
6210 }
6211
6212 return SDValue();
6213}
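// Worked example (sketch) for the non-temporal path above: a 256-bit v8i32
// non-temporal store is split into its low half (elements 0..3) and high half
// (elements 4..7), and the two 128-bit halves are emitted as one
// AArch64ISD::STNP node, i.e. a single paired non-temporal store.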
6214
6215/// Lower atomic or volatile 128-bit stores to a single STP instruction.
6216SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
6217 SelectionDAG &DAG) const {
6218 MemSDNode *StoreNode = cast<MemSDNode>(Op);
6219 assert(StoreNode->getMemoryVT() == MVT::i128);
6220 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
6221
6222 bool IsStoreRelease =
6223       StoreNode->getMergedOrdering() == AtomicOrdering::Release;
6224 if (StoreNode->isAtomic())
6225 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
6226 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
6229
6230 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
6231 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
6232 ? StoreNode->getOperand(1)
6233 : StoreNode->getOperand(2);
6234 SDLoc DL(Op);
6235 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
6236 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
6237 if (DAG.getDataLayout().isBigEndian())
6238 std::swap(StoreValue.first, StoreValue.second);
6239   SDValue Result = DAG.getMemIntrinsicNode(
6240 Opcode, DL, DAG.getVTList(MVT::Other),
6241 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
6242 StoreNode->getBasePtr()},
6243 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6244 return Result;
6245}
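// Worked example (sketch, little-endian): a volatile i128 store is split into
// Lo (bits 0..63) and Hi (bits 64..127) and emitted as "stp xLo, xHi, [addr]"
// (register names illustrative), so the low half lands at the lower address;
// an atomic release store uses the STILP form instead, and on big-endian the
// two halves are swapped first.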
6246
6247SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
6248 SelectionDAG &DAG) const {
6249 SDLoc DL(Op);
6250 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
6251 assert(LoadNode && "Expected custom lowering of a load node");
6252
6253 if (LoadNode->getMemoryVT() == MVT::i64x8) {
6254     SmallVector<SDValue, 8> Ops;
6255 SDValue Base = LoadNode->getBasePtr();
6256 SDValue Chain = LoadNode->getChain();
6257 EVT PtrVT = Base.getValueType();
6258 for (unsigned i = 0; i < 8; i++) {
6259 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
6260 DAG.getConstant(i * 8, DL, PtrVT));
6261 SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
6262 LoadNode->getPointerInfo(),
6263 LoadNode->getOriginalAlign());
6264 Ops.push_back(Part);
6265 Chain = SDValue(Part.getNode(), 1);
6266 }
6267 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
6268 return DAG.getMergeValues({Loaded, Chain}, DL);
6269 }
6270
6271 // Custom lowering for extending v4i8 vector loads.
6272 EVT VT = Op->getValueType(0);
6273 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
6274
6275 if (LoadNode->getMemoryVT() != MVT::v4i8)
6276 return SDValue();
6277
6278 unsigned ExtType;
6279 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
6280 ExtType = ISD::SIGN_EXTEND;
6281 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
6282 LoadNode->getExtensionType() == ISD::EXTLOAD)
6283 ExtType = ISD::ZERO_EXTEND;
6284 else
6285 return SDValue();
6286
6287 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
6288 LoadNode->getBasePtr(), MachinePointerInfo());
6289 SDValue Chain = Load.getValue(1);
6290 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
6291 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
6292 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
6293 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
6294 DAG.getConstant(0, DL, MVT::i64));
6295 if (VT == MVT::v4i32)
6296 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
6297 return DAG.getMergeValues({Ext, Chain}, DL);
6298}
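// Data-flow sketch for the extending v4i8 load above: the four bytes are loaded
// as a single f32, placed into lane 0 of a v2f32, reinterpreted as v8i8,
// zero- or sign-extended to v8i16, and the low v4i16 half extracted (with one
// more extend when the result type is v4i32).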
6299
6300// Generate SUBS and CSEL for integer abs.
6301SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
6302 MVT VT = Op.getSimpleValueType();
6303
6304 if (VT.isVector())
6305 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
6306
6307 SDLoc DL(Op);
6308 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
6309 Op.getOperand(0));
6310 // Generate SUBS & CSEL.
6311 SDValue Cmp =
6312 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
6313 Op.getOperand(0), DAG.getConstant(0, DL, VT));
6314 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
6315 DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
6316 Cmp.getValue(1));
6317}
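// Illustrative sketch (hypothetical helper, not part of the lowering): a scalar
// model of the SUBS + CSEL sequence emitted above. Two's-complement wrap is
// assumed, so abs(INT_MIN) stays INT_MIN, matching the hardware sequence.
static inline int scalarAbsModel(int X) {
  int Neg = (int)(0u - (unsigned)X); // the SUB above, computed with wraparound
  return (X >= 0) ? X : Neg;         // SUBS compares X with 0; CSEL picks on PL
}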
6318
6320 SDValue Chain = Op.getOperand(0);
6321 SDValue Cond = Op.getOperand(1);
6322 SDValue Dest = Op.getOperand(2);
6323
6324   AArch64CC::CondCode CC;
6325 if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
6326 SDLoc dl(Op);
6327 SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32);
6328 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
6329 Cmp);
6330 }
6331
6332 return SDValue();
6333}
6334
6335// Treat FSHR with constant shifts as a legal operation; otherwise it is
6336// expanded. FSHL is converted to FSHR before deciding what to do with it.
6338 SDValue Shifts = Op.getOperand(2);
6339 // Check if the shift amount is a constant
6340 // If opcode is FSHL, convert it to FSHR
6341 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
6342 SDLoc DL(Op);
6343 MVT VT = Op.getSimpleValueType();
6344
6345 if (Op.getOpcode() == ISD::FSHL) {
6346 unsigned int NewShiftNo =
6347 VT.getFixedSizeInBits() - ShiftNo->getZExtValue();
6348 return DAG.getNode(
6349 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
6350 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
6351 } else if (Op.getOpcode() == ISD::FSHR) {
6352 return Op;
6353 }
6354 }
6355
6356 return SDValue();
6357}
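// Illustrative sketch (hypothetical helper, not part of the lowering): the
// rewrite above relies on the identity fshl(A, B, C) == fshr(A, B, N - C) for a
// constant amount 0 < C < N; e.g. for i64, fshl(A, B, 3) becomes fshr(A, B, 61).
// A scalar 64-bit model of fshl for such amounts:
static inline uint64_t funnelShiftLeftModel(uint64_t A, uint64_t B, unsigned C) {
  return (A << C) | (B >> (64 - C)); // valid for 0 < C < 64
}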
6358
6360 SDValue X = Op.getOperand(0);
6361 EVT XScalarTy = X.getValueType();
6362 SDValue Exp = Op.getOperand(1);
6363
6364 SDLoc DL(Op);
6365 EVT XVT, ExpVT;
6366 switch (Op.getSimpleValueType().SimpleTy) {
6367 default:
6368 return SDValue();
6369 case MVT::bf16:
6370 case MVT::f16:
6371 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
6372 [[fallthrough]];
6373 case MVT::f32:
6374 XVT = MVT::nxv4f32;
6375 ExpVT = MVT::nxv4i32;
6376 break;
6377 case MVT::f64:
6378 XVT = MVT::nxv2f64;
6379 ExpVT = MVT::nxv2i64;
6380 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
6381 break;
6382 }
6383
6384 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
6385 SDValue VX =
6386 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
6387 SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
6388 DAG.getUNDEF(ExpVT), Exp, Zero);
6389 SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1),
6390 AArch64SVEPredPattern::all);
6391 SDValue FScale =
6392       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, XVT,
6393 DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64),
6394 VPg, VX, VExp);
6395 SDValue Final =
6396 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
6397 if (X.getValueType() != XScalarTy)
6398 Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
6399 DAG.getIntPtrConstant(1, SDLoc(Op)));
6400 return Final;
6401}
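// Worked example (sketch): FLDEXP computes X * 2^Exp, so ldexp(1.5, 4) == 24.0.
// The code above places X and Exp into lane 0 of SVE vectors, applies the
// aarch64_sve_fscale intrinsic under an all-true predicate, extracts lane 0 of
// the result, and rounds back to half/bfloat16 when the input was extended.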
6402
6403SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
6404 SelectionDAG &DAG) const {
6405 LLVM_DEBUG(dbgs() << "Custom lowering: ");
6406 LLVM_DEBUG(Op.dump());
6407
6408 switch (Op.getOpcode()) {
6409 default:
6410 llvm_unreachable("unimplemented operand");
6411 return SDValue();
6412 case ISD::BITCAST:
6413 return LowerBITCAST(Op, DAG);
6414 case ISD::GlobalAddress:
6415 return LowerGlobalAddress(Op, DAG);
6417 return LowerGlobalTLSAddress(Op, DAG);
6418 case ISD::SETCC:
6419 case ISD::STRICT_FSETCC:
6421 return LowerSETCC(Op, DAG);
6422 case ISD::SETCCCARRY:
6423 return LowerSETCCCARRY(Op, DAG);
6424 case ISD::BRCOND:
6425 return LowerBRCOND(Op, DAG);
6426 case ISD::BR_CC:
6427 return LowerBR_CC(Op, DAG);
6428 case ISD::SELECT:
6429 return LowerSELECT(Op, DAG);
6430 case ISD::SELECT_CC:
6431 return LowerSELECT_CC(Op, DAG);
6432 case ISD::JumpTable:
6433 return LowerJumpTable(Op, DAG);
6434 case ISD::BR_JT:
6435 return LowerBR_JT(Op, DAG);
6436 case ISD::ConstantPool:
6437 return LowerConstantPool(Op, DAG);
6438 case ISD::BlockAddress:
6439 return LowerBlockAddress(Op, DAG);
6440 case ISD::VASTART:
6441 return LowerVASTART(Op, DAG);
6442 case ISD::VACOPY:
6443 return LowerVACOPY(Op, DAG);
6444 case ISD::VAARG:
6445 return LowerVAARG(Op, DAG);
6446 case ISD::UADDO_CARRY:
6447 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
6448 case ISD::USUBO_CARRY:
6449 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
6450 case ISD::SADDO_CARRY:
6451 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
6452 case ISD::SSUBO_CARRY:
6453 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
6454 case ISD::SADDO:
6455 case ISD::UADDO:
6456 case ISD::SSUBO:
6457 case ISD::USUBO:
6458 case ISD::SMULO:
6459 case ISD::UMULO:
6460 return LowerXALUO(Op, DAG);
6461 case ISD::FADD:
6462 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
6463 case ISD::FSUB:
6464 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
6465 case ISD::FMUL:
6466 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
6467 case ISD::FMA:
6468 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
6469 case ISD::FDIV:
6470 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
6471 case ISD::FNEG:
6472 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
6473 case ISD::FCEIL:
6474 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
6475 case ISD::FFLOOR:
6476 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
6477 case ISD::FNEARBYINT:
6478 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
6479 case ISD::FRINT:
6480 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
6481 case ISD::FROUND:
6482 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
6483 case ISD::FROUNDEVEN:
6484 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
6485 case ISD::FTRUNC:
6486 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
6487 case ISD::FSQRT:
6488 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
6489 case ISD::FABS:
6490 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
6491 case ISD::FP_ROUND:
6493 return LowerFP_ROUND(Op, DAG);
6494 case ISD::FP_EXTEND:
6495 return LowerFP_EXTEND(Op, DAG);
6496 case ISD::FRAMEADDR:
6497 return LowerFRAMEADDR(Op, DAG);
6498 case ISD::SPONENTRY:
6499 return LowerSPONENTRY(Op, DAG);
6500 case ISD::RETURNADDR:
6501 return LowerRETURNADDR(Op, DAG);
6503 return LowerADDROFRETURNADDR(Op, DAG);
6505 return LowerCONCAT_VECTORS(Op, DAG);
6507 return LowerINSERT_VECTOR_ELT(Op, DAG);
6509 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
6510 case ISD::BUILD_VECTOR:
6511 return LowerBUILD_VECTOR(Op, DAG);
6513 return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
6515 return LowerVECTOR_SHUFFLE(Op, DAG);
6516 case ISD::SPLAT_VECTOR:
6517 return LowerSPLAT_VECTOR(Op, DAG);
6519 return LowerEXTRACT_SUBVECTOR(Op, DAG);
6521 return LowerINSERT_SUBVECTOR(Op, DAG);
6522 case ISD::SDIV:
6523 case ISD::UDIV:
6524 return LowerDIV(Op, DAG);
6525 case ISD::SMIN:
6526 case ISD::UMIN:
6527 case ISD::SMAX:
6528 case ISD::UMAX:
6529 return LowerMinMax(Op, DAG);
6530 case ISD::SRA:
6531 case ISD::SRL:
6532 case ISD::SHL:
6533 return LowerVectorSRA_SRL_SHL(Op, DAG);
6534 case ISD::SHL_PARTS:
6535 case ISD::SRL_PARTS:
6536 case ISD::SRA_PARTS:
6537 return LowerShiftParts(Op, DAG);
6538 case ISD::CTPOP:
6539 case ISD::PARITY:
6540 return LowerCTPOP_PARITY(Op, DAG);
6541 case ISD::FCOPYSIGN:
6542 return LowerFCOPYSIGN(Op, DAG);
6543 case ISD::OR:
6544 return LowerVectorOR(Op, DAG);
6545 case ISD::XOR:
6546 return LowerXOR(Op, DAG);
6547 case ISD::PREFETCH:
6548 return LowerPREFETCH(Op, DAG);
6549 case ISD::SINT_TO_FP:
6550 case ISD::UINT_TO_FP:
6553 return LowerINT_TO_FP(Op, DAG);
6554 case ISD::FP_TO_SINT:
6555 case ISD::FP_TO_UINT:
6558 return LowerFP_TO_INT(Op, DAG);
6561 return LowerFP_TO_INT_SAT(Op, DAG);
6562 case ISD::FSINCOS:
6563 return LowerFSINCOS(Op, DAG);
6564 case ISD::GET_ROUNDING:
6565 return LowerGET_ROUNDING(Op, DAG);
6566 case ISD::SET_ROUNDING:
6567 return LowerSET_ROUNDING(Op, DAG);
6568 case ISD::GET_FPMODE:
6569 return LowerGET_FPMODE(Op, DAG);
6570 case ISD::SET_FPMODE:
6571 return LowerSET_FPMODE(Op, DAG);
6572 case ISD::RESET_FPMODE:
6573 return LowerRESET_FPMODE(Op, DAG);
6574 case ISD::MUL:
6575 return LowerMUL(Op, DAG);
6576 case ISD::MULHS:
6577 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
6578 case ISD::MULHU:
6579 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
6581 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6583 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6585 return LowerINTRINSIC_VOID(Op, DAG);
6586 case ISD::ATOMIC_STORE:
6587 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
6588 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
6589 return LowerStore128(Op, DAG);
6590 }
6591 return SDValue();
6592 case ISD::STORE:
6593 return LowerSTORE(Op, DAG);
6594 case ISD::MSTORE:
6595 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
6596 case ISD::MGATHER:
6597 return LowerMGATHER(Op, DAG);
6598 case ISD::MSCATTER:
6599 return LowerMSCATTER(Op, DAG);
6601 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
6602 case ISD::VECREDUCE_ADD:
6603 case ISD::VECREDUCE_AND:
6604 case ISD::VECREDUCE_OR:
6605 case ISD::VECREDUCE_XOR:
6615 return LowerVECREDUCE(Op, DAG);
6617 return LowerATOMIC_LOAD_AND(Op, DAG);
6619 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6620 case ISD::VSCALE:
6621 return LowerVSCALE(Op, DAG);
6622 case ISD::ANY_EXTEND:
6623 case ISD::SIGN_EXTEND:
6624 case ISD::ZERO_EXTEND:
6625 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
6627 // Only custom lower when ExtraVT has a legal byte based element type.
6628 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
6629 EVT ExtraEltVT = ExtraVT.getVectorElementType();
6630 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
6631 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
6632 return SDValue();
6633
6634 return LowerToPredicatedOp(Op, DAG,
6636 }
6637 case ISD::TRUNCATE:
6638 return LowerTRUNCATE(Op, DAG);
6639 case ISD::MLOAD:
6640 return LowerMLOAD(Op, DAG);
6641 case ISD::LOAD:
6642 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
6643 !Subtarget->isNeonAvailable()))
6644 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
6645 return LowerLOAD(Op, DAG);
6646 case ISD::ADD:
6647 case ISD::AND:
6648 case ISD::SUB:
6649 return LowerToScalableOp(Op, DAG);
6650 case ISD::FMAXIMUM:
6651 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
6652 case ISD::FMAXNUM:
6653 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
6654 case ISD::FMINIMUM:
6655 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
6656 case ISD::FMINNUM:
6657 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
6658 case ISD::VSELECT:
6659 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
6660 case ISD::ABS:
6661 return LowerABS(Op, DAG);
6662 case ISD::ABDS:
6663 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
6664 case ISD::ABDU:
6665 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
6666 case ISD::AVGFLOORS:
6667 return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
6668 case ISD::AVGFLOORU:
6669 return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
6670 case ISD::AVGCEILS:
6671 return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
6672 case ISD::AVGCEILU:
6673 return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
6674 case ISD::BITREVERSE:
6675 return LowerBitreverse(Op, DAG);
6676 case ISD::BSWAP:
6677 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
6678 case ISD::CTLZ:
6679 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
6680 case ISD::CTTZ:
6681 return LowerCTTZ(Op, DAG);
6682 case ISD::VECTOR_SPLICE:
6683 return LowerVECTOR_SPLICE(Op, DAG);
6685 return LowerVECTOR_DEINTERLEAVE(Op, DAG);
6687 return LowerVECTOR_INTERLEAVE(Op, DAG);
6688 case ISD::LROUND:
6689 case ISD::LLROUND:
6690 case ISD::LRINT:
6691 case ISD::LLRINT: {
6692 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
6693 Op.getOperand(0).getValueType() == MVT::bf16) &&
6694 "Expected custom lowering of rounding operations only for f16");
6695 SDLoc DL(Op);
6696 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
6697 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
6698 }
6699 case ISD::STRICT_LROUND:
6701 case ISD::STRICT_LRINT:
6702 case ISD::STRICT_LLRINT: {
6703 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
6704 Op.getOperand(1).getValueType() == MVT::bf16) &&
6705 "Expected custom lowering of rounding operations only for f16");
6706 SDLoc DL(Op);
6707 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
6708 {Op.getOperand(0), Op.getOperand(1)});
6709 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
6710 {Ext.getValue(1), Ext.getValue(0)});
6711 }
6712 case ISD::WRITE_REGISTER: {
6713 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
6714 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
6715 SDLoc DL(Op);
6716
6717 SDValue Chain = Op.getOperand(0);
6718 SDValue SysRegName = Op.getOperand(1);
6719 std::pair<SDValue, SDValue> Pair =
6720 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
6721
6722 // chain = MSRR(chain, sysregname, lo, hi)
6723 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
6724 SysRegName, Pair.first, Pair.second);
6725
6726 return Result;
6727 }
6728 case ISD::FSHL:
6729 case ISD::FSHR:
6730 return LowerFunnelShift(Op, DAG);
6731 case ISD::FLDEXP:
6732 return LowerFLDEXP(Op, DAG);
6733 }
6734}
6735
6737 return !Subtarget->useSVEForFixedLengthVectors();
6738}
6739
6740bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
6741 EVT VT, bool OverrideNEON) const {
6742 if (!VT.isFixedLengthVector() || !VT.isSimple())
6743 return false;
6744
6745 // Don't use SVE for vectors we cannot scalarize if required.
6746 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
6747 // Fixed length predicates should be promoted to i8.
6748 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
6749 case MVT::i1:
6750 default:
6751 return false;
6752 case MVT::i8:
6753 case MVT::i16:
6754 case MVT::i32:
6755 case MVT::i64:
6756 case MVT::f16:
6757 case MVT::f32:
6758 case MVT::f64:
6759 break;
6760 }
6761
6762 // NEON-sized vectors can be emulated using SVE instructions.
6763 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
6764 return Subtarget->hasSVEorSME();
6765
6766 // Ensure NEON MVTs only belong to a single register class.
6767 if (VT.getFixedSizeInBits() <= 128)
6768 return false;
6769
6770 // Ensure wider than NEON code generation is enabled.
6771 if (!Subtarget->useSVEForFixedLengthVectors())
6772 return false;
6773
6774 // Don't use SVE for types that don't fit.
6775 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
6776 return false;
6777
6778 // TODO: Perhaps an artificial restriction, but worth having whilst getting
6779 // the base fixed length SVE support in place.
6780 if (!VT.isPow2VectorType())
6781 return false;
6782
6783 return true;
6784}
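// Worked example (sketch, assuming SVE is enabled for fixed-length vectors with
// a minimum vector size of 256 bits): v8i32 (256 bits) is wider than NEON, fits
// the minimum SVE register and has a power-of-two element count, so it is
// handled with SVE; v4i32 (128 bits) stays on NEON unless OverrideNEON is set.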
6785
6786//===----------------------------------------------------------------------===//
6787// Calling Convention Implementation
6788//===----------------------------------------------------------------------===//
6789
6790static unsigned getIntrinsicID(const SDNode *N) {
6791 unsigned Opcode = N->getOpcode();
6792 switch (Opcode) {
6793 default:
6796 unsigned IID = N->getConstantOperandVal(0);
6797 if (IID < Intrinsic::num_intrinsics)
6798 return IID;
6800 }
6801 }
6802}
6803
6805 SDValue N1) const {
6806 if (!N0.hasOneUse())
6807 return false;
6808
6809 unsigned IID = getIntrinsicID(N1.getNode());
6810 // Avoid reassociating expressions that can be lowered to smlal/umlal.
6811 if (IID == Intrinsic::aarch64_neon_umull ||
6812 N1.getOpcode() == AArch64ISD::UMULL ||
6813 IID == Intrinsic::aarch64_neon_smull ||
6814       N1.getOpcode() == AArch64ISD::SMULL)
6815 return N0.getOpcode() != ISD::ADD;
6816
6817 return true;
6818}
6819
6820/// Selects the correct CCAssignFn for a given CallingConvention value.
6822 bool IsVarArg) const {
6823 switch (CC) {
6824 default:
6825 report_fatal_error("Unsupported calling convention.");
6826 case CallingConv::GHC:
6827 return CC_AArch64_GHC;
6828 case CallingConv::C:
6829 case CallingConv::Fast:
6833 case CallingConv::Swift:
6835 case CallingConv::Tail:
6836 case CallingConv::GRAAL:
6837 if (Subtarget->isTargetWindows()) {
6838 if (IsVarArg) {
6839 if (Subtarget->isWindowsArm64EC())
6842 }
6843 return CC_AArch64_Win64PCS;
6844 }
6845 if (!Subtarget->isTargetDarwin())
6846 return CC_AArch64_AAPCS;
6847 if (!IsVarArg)
6848 return CC_AArch64_DarwinPCS;
6851 case CallingConv::Win64:
6852 if (IsVarArg) {
6853 if (Subtarget->isWindowsArm64EC())
6856 }
6857 return CC_AArch64_Win64PCS;
6859 if (Subtarget->isWindowsArm64EC())
6866 return CC_AArch64_AAPCS;
6871 }
6872}
6873
6874CCAssignFn *
6876 switch (CC) {
6877 default:
6878 return RetCC_AArch64_AAPCS;
6882 if (Subtarget->isWindowsArm64EC())
6884 return RetCC_AArch64_AAPCS;
6885 }
6886}
6887
6888
6889unsigned
6890AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL,
6891 SelectionDAG &DAG) const {
6892   MachineFunction &MF = DAG.getMachineFunction();
6893 MachineFrameInfo &MFI = MF.getFrameInfo();
6894
6895 // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case)
6896 SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
6897 DAG.getConstant(1, DL, MVT::i32));
6898 SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N);
6899 SDValue Ops[] = {Chain, NN, DAG.getConstant(1, DL, MVT::i64)};
6900 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other);
6901 SDValue Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, VTs, Ops);
6902 Chain = Buffer.getValue(1);
6903 MFI.CreateVariableSizedObject(Align(1), nullptr);
6904
6905 // Allocate an additional TPIDR2 object on the stack (16 bytes)
6906 unsigned TPIDR2Obj = MFI.CreateStackObject(16, Align(16), false);
6907
6908 // Store the buffer pointer to the TPIDR2 stack object.
6911 TPIDR2Obj,
6913 Chain = DAG.getStore(Chain, DL, Buffer, Ptr, MPI);
6914
6915 // Set the reserved bytes (10-15) to zero
6916 EVT PtrTy = Ptr.getValueType();
6917 SDValue ReservedPtr =
6918 DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(10, DL, PtrTy));
6919 Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i16), ReservedPtr,
6920 MPI);
6921 ReservedPtr =
6922 DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(12, DL, PtrTy));
6923 Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i32), ReservedPtr,
6924 MPI);
6925
6926 return TPIDR2Obj;
6927}
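// Worked example (sketch): RDSVL #1 returns the streaming vector length in
// bytes, so with SVL = 512 bits the buffer is 64 * 64 = 4096 bytes, the
// worst-case size of ZA. The separate 16-byte object is the TPIDR2 block: its
// first 8 bytes receive the buffer pointer and bytes 10-15 are zeroed above.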
6928
6929static bool isPassedInFPR(EVT VT) {
6930 return VT.isFixedLengthVector() ||
6931 (VT.isFloatingPoint() && !VT.isScalableVector());
6932}
6933
6934SDValue AArch64TargetLowering::LowerFormalArguments(
6935 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
6936 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
6937 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
6938   MachineFunction &MF = DAG.getMachineFunction();
6939 const Function &F = MF.getFunction();
6940 MachineFrameInfo &MFI = MF.getFrameInfo();
6941 bool IsWin64 = Subtarget->isCallingConvWin64(F.getCallingConv());
6942 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
6943 (isVarArg && Subtarget->isWindowsArm64EC());
6944   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
6945
6947 GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
6949 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
6950 FuncInfo->setIsSVECC(true);
6951
6952 // Assign locations to all of the incoming arguments.
6953   SmallVector<CCValAssign, 16> ArgLocs;
6954 DenseMap<unsigned, SDValue> CopiedRegs;
6955 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
6956
6957 // At this point, Ins[].VT may already be promoted to i32. To correctly
6958 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
6959 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
6960 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
6961 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
6962 // LocVT.
6963 unsigned NumArgs = Ins.size();
6964 Function::const_arg_iterator CurOrigArg = F.arg_begin();
6965 unsigned CurArgIdx = 0;
6966 for (unsigned i = 0; i != NumArgs; ++i) {
6967 MVT ValVT = Ins[i].VT;
6968 if (Ins[i].isOrigArg()) {
6969 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
6970 CurArgIdx = Ins[i].getOrigArgIndex();
6971
6972 // Get type of the original argument.
6973 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
6974 /*AllowUnknown*/ true);
6975 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
6976 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
6977 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
6978 ValVT = MVT::i8;
6979 else if (ActualMVT == MVT::i16)
6980 ValVT = MVT::i16;
6981 }
6982 bool UseVarArgCC = false;
6983 if (IsWin64)
6984 UseVarArgCC = isVarArg;
6985 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
6986 bool Res =
6987 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
6988 assert(!Res && "Call operand has unhandled type");
6989 (void)Res;
6990 }
6991
6992   SMEAttrs Attrs(MF.getFunction());
6993 bool IsLocallyStreaming =
6994 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
6995 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
6996 SDValue Glue = Chain.getValue(1);
6997
6998 SmallVector<SDValue, 16> ArgValues;
6999 unsigned ExtraArgLocs = 0;
7000 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
7001 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
7002
7003 if (Ins[i].Flags.isByVal()) {
7004 // Byval is used for HFAs in the PCS, but the system should work in a
7005 // non-compliant manner for larger structs.
7006 EVT PtrVT = getPointerTy(DAG.getDataLayout());
7007 int Size = Ins[i].Flags.getByValSize();
7008 unsigned NumRegs = (Size + 7) / 8;
7009
7010       // FIXME: This works on big-endian for composite byvals, which are the
7011       // common case. It should also work for fundamental types.
7012 unsigned FrameIdx =
7013 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
7014 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
7015 InVals.push_back(FrameIdxN);
7016
7017 continue;
7018 }
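// Worked example (sketch) for the byval case above: a 12-byte byval struct
// gives NumRegs = (12 + 7) / 8 = 2, so a 16-byte fixed object is created at
// the argument's stack offset and the function body receives that object's
// frame-index address rather than the value itself.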
7019
7020 if (Ins[i].Flags.isSwiftAsync())
7022
7023 SDValue ArgValue;
7024 if (VA.isRegLoc()) {
7025 // Arguments stored in registers.
7026 EVT RegVT = VA.getLocVT();
7027 const TargetRegisterClass *RC;
7028
7029 if (RegVT == MVT::i32)
7030 RC = &AArch64::GPR32RegClass;
7031 else if (RegVT == MVT::i64)
7032 RC = &AArch64::GPR64RegClass;
7033 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
7034 RC = &AArch64::FPR16RegClass;
7035 else if (RegVT == MVT::f32)
7036 RC = &AArch64::FPR32RegClass;
7037 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
7038 RC = &AArch64::FPR64RegClass;
7039 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
7040 RC = &AArch64::FPR128RegClass;
7041 else if (RegVT.isScalableVector() &&
7042 RegVT.getVectorElementType() == MVT::i1) {
7043 FuncInfo->setIsSVECC(true);
7044 RC = &AArch64::PPRRegClass;
7045 } else if (RegVT == MVT::aarch64svcount) {
7046 FuncInfo->setIsSVECC(true);
7047 RC = &AArch64::PPRRegClass;
7048 } else if (RegVT.isScalableVector()) {
7049 FuncInfo->setIsSVECC(true);
7050 RC = &AArch64::ZPRRegClass;
7051 } else
7052 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
7053
7054 // Transform the arguments in physical registers into virtual ones.
7055 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
7056
7057 if (IsLocallyStreaming) {
7058 // LocallyStreamingFunctions must insert the SMSTART in the correct
7059 // position, so we use Glue to ensure no instructions can be scheduled
7060 // between the chain of:
7061 // t0: ch,glue = EntryNode
7062 // t1: res,ch,glue = CopyFromReg
7063 // ...
7064 // tn: res,ch,glue = CopyFromReg t(n-1), ..
7065 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
7066 // ^^^^^^
7067 // This will be the new Chain/Root node.
7068 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
7069 Glue = ArgValue.getValue(2);
7070 if (isPassedInFPR(ArgValue.getValueType())) {
7071 ArgValue =
7073 DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
7074 {ArgValue, Glue});
7075 Glue = ArgValue.getValue(1);
7076 }
7077 } else
7078 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
7079
7080 // If this is an 8, 16 or 32-bit value, it is really passed promoted
7081 // to 64 bits. Insert an assert[sz]ext to capture this, then
7082 // truncate to the right size.
7083 switch (VA.getLocInfo()) {
7084 default:
7085 llvm_unreachable("Unknown loc info!");
7086 case CCValAssign::Full:
7087 break;
7088     case CCValAssign::Indirect:
7089 assert(
7090 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
7091 "Indirect arguments should be scalable on most subtargets");
7092 break;
7093 case CCValAssign::BCvt:
7094 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
7095 break;
7096 case CCValAssign::AExt:
7097 case CCValAssign::SExt:
7098 case CCValAssign::ZExt:
7099 break;
7101 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
7102 DAG.getConstant(32, DL, RegVT));
7103 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
7104 break;
7105 }
7106 } else { // VA.isRegLoc()
7107 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
7108 unsigned ArgOffset = VA.getLocMemOffset();
7109 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
7110 ? VA.getLocVT().getSizeInBits()
7111 : VA.getValVT().getSizeInBits()) / 8;
7112
7113 uint32_t BEAlign = 0;
7114 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
7115 !Ins[i].Flags.isInConsecutiveRegs())
7116 BEAlign = 8 - ArgSize;
7117
7118 SDValue FIN;
7119 MachinePointerInfo PtrInfo;
7120 if (StackViaX4) {
7121 // In both the ARM64EC varargs convention and the thunk convention,
7122 // arguments on the stack are accessed relative to x4, not sp. In
7123 // the thunk convention, there's an additional offset of 32 bytes
7124 // to account for the shadow store.
7125 unsigned ObjOffset = ArgOffset + BEAlign;
7126 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
7127 ObjOffset += 32;
7128 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
7129 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7130 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
7131 DAG.getConstant(ObjOffset, DL, MVT::i64));
7133 } else {
7134 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
7135
7136 // Create load nodes to retrieve arguments from the stack.
7137 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
7138 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
7139 }
7140
7141       // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
7142       ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
7143 MVT MemVT = VA.getValVT();
7144
7145 switch (VA.getLocInfo()) {
7146 default:
7147 break;
7148 case CCValAssign::Trunc:
7149 case CCValAssign::BCvt:
7150 MemVT = VA.getLocVT();
7151 break;
7154 Subtarget->isWindowsArm64EC()) &&
7155 "Indirect arguments should be scalable on most subtargets");
7156 MemVT = VA.getLocVT();
7157 break;
7158 case CCValAssign::SExt:
7159 ExtType = ISD::SEXTLOAD;
7160 break;
7161 case CCValAssign::ZExt:
7162 ExtType = ISD::ZEXTLOAD;
7163 break;
7164 case CCValAssign::AExt:
7165 ExtType = ISD::EXTLOAD;
7166 break;
7167 }
7168
7169 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
7170 MemVT);
7171 }
7172
7173 if (VA.getLocInfo() == CCValAssign::Indirect) {
7174 assert((VA.getValVT().isScalableVT() ||
7175 Subtarget->isWindowsArm64EC()) &&
7176 "Indirect arguments should be scalable on most subtargets");
7177
7178 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
7179 unsigned NumParts = 1;
7180 if (Ins[i].Flags.isInConsecutiveRegs()) {
7181 assert(!Ins[i].Flags.isInConsecutiveRegsLast());
7182 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
7183 ++NumParts;
7184 }
7185
7186 MVT PartLoad = VA.getValVT();
7187 SDValue Ptr = ArgValue;
7188
7189 // Ensure we generate all loads for each tuple part, whilst updating the
7190 // pointer after each load correctly using vscale.
7191 while (NumParts > 0) {
7192 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
7193 InVals.push_back(ArgValue);
7194 NumParts--;
7195 if (NumParts > 0) {
7196 SDValue BytesIncrement;
7197 if (PartLoad.isScalableVector()) {
7198 BytesIncrement = DAG.getVScale(
7199 DL, Ptr.getValueType(),
7200 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
7201 } else {
7202 BytesIncrement = DAG.getConstant(
7203 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
7204 Ptr.getValueType());
7205 }
7207 Flags.setNoUnsignedWrap(true);
7208 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7209 BytesIncrement, Flags);
7210 ExtraArgLocs++;
7211 i++;
7212 }
7213 }
7214 } else {
7215 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
7216 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
7217 ArgValue, DAG.getValueType(MVT::i32));
7218
7219 // i1 arguments are zero-extended to i8 by the caller. Emit a
7220 // hint to reflect this.
7221 if (Ins[i].isOrigArg()) {
7222 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
7223 if (OrigArg->getType()->isIntegerTy(1)) {
7224 if (!Ins[i].Flags.isZExt()) {
7225 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
7226 ArgValue.getValueType(), ArgValue);
7227 }
7228 }
7229 }
7230
7231 InVals.push_back(ArgValue);
7232 }
7233 }
7234 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
7235
7236 // Insert the SMSTART if this is a locally streaming function and
7237 // make sure it is Glued to the last CopyFromReg value.
7238 if (IsLocallyStreaming) {
7239 SDValue PStateSM;
7240 if (Attrs.hasStreamingCompatibleInterface()) {
7241 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
7244 FuncInfo->setPStateSMReg(Reg);
7245 Chain = DAG.getCopyToReg(Chain, DL, Reg, PStateSM);
7246 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
7248 } else
7249 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
7251
7252 // Ensure that the SMSTART happens after the CopyWithChain such that its
7253 // chain result is used.
7254 for (unsigned I=0; I<InVals.size(); ++I) {
7256 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
7257 Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
7258 InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
7259 InVals[I].getValueType());
7260 }
7261 }
7262
7263 // varargs
7264 if (isVarArg) {
7265 if (!Subtarget->isTargetDarwin() || IsWin64) {
7266 // The AAPCS variadic function ABI is identical to the non-variadic
7267 // one. As a result there may be more arguments in registers and we should
7268 // save them for future reference.
7269 // Win64 variadic functions also pass arguments in registers, but all float
7270 // arguments are passed in integer registers.
7271 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
7272 }
7273
7274 // This will point to the next argument passed via stack.
7275 unsigned VarArgsOffset = CCInfo.getStackSize();
7276 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
7277 VarArgsOffset = alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
7278 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
7279 FuncInfo->setVarArgsStackIndex(
7280 MFI.CreateFixedObject(4, VarArgsOffset, true));
7281
7282 if (MFI.hasMustTailInVarArgFunc()) {
7283 SmallVector<MVT, 2> RegParmTypes;
7284 RegParmTypes.push_back(MVT::i64);
7285 RegParmTypes.push_back(MVT::f128);
7286 // Compute the set of forwarded registers. The rest are scratch.
7288 FuncInfo->getForwardedMustTailRegParms();
7289 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
7291
7292 // Conservatively forward X8, since it might be used for aggregate return.
7293 if (!CCInfo.isAllocated(AArch64::X8)) {
7294 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
7295 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
7296 }
7297 }
7298 }
7299
7300 // On Windows, InReg pointers must be returned, so record the pointer in a
7301 // virtual register at the start of the function so it can be returned in the
7302 // epilogue.
7303 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
7304 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
7305 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
7306 Ins[I].Flags.isInReg()) &&
7307 Ins[I].Flags.isSRet()) {
7308 assert(!FuncInfo->getSRetReturnReg());
7309
7310 MVT PtrTy = getPointerTy(DAG.getDataLayout());
7311 Register Reg =
7313 FuncInfo->setSRetReturnReg(Reg);
7314
7315 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
7316 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
7317 break;
7318 }
7319 }
7320 }
7321
7322 unsigned StackArgSize = CCInfo.getStackSize();
7323 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
7324 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
7325 // This is a non-standard ABI so by fiat I say we're allowed to make full
7326 // use of the stack area to be popped, which must be aligned to 16 bytes in
7327 // any case:
7328 StackArgSize = alignTo(StackArgSize, 16);
7329
7330 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
7331 // a multiple of 16.
7332 FuncInfo->setArgumentStackToRestore(StackArgSize);
7333
7334 // This realignment carries over to the available bytes below. Our own
7335 // callers will guarantee the space is free by giving an aligned value to
7336 // CALLSEQ_START.
7337 }
7338 // Even if we're not expected to free up the space, it's useful to know how
7339 // much is there while considering tail calls (because we can reuse it).
7340 FuncInfo->setBytesInStackArgArea(StackArgSize);
7341
7342 if (Subtarget->hasCustomCallingConv())
7344
7345 // Conservatively assume the function requires the lazy-save mechanism.
7346 if (SMEAttrs(MF.getFunction()).hasZAState()) {
7347 unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG);
7348 FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj);
7349 }
7350
7351 return Chain;
7352}
7353
7354void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
7355 SelectionDAG &DAG,
7356 const SDLoc &DL,
7357 SDValue &Chain) const {
7358   MachineFunction &MF = DAG.getMachineFunction();
7359 MachineFrameInfo &MFI = MF.getFrameInfo();
7361 auto PtrVT = getPointerTy(DAG.getDataLayout());
7362 bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
7363
7365
7367 unsigned NumGPRArgRegs = GPRArgRegs.size();
7368 if (Subtarget->isWindowsArm64EC()) {
7369 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
7370 // functions.
7371 NumGPRArgRegs = 4;
7372 }
7373 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
7374
7375 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
7376 int GPRIdx = 0;
7377 if (GPRSaveSize != 0) {
7378 if (IsWin64) {
7379 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
7380 if (GPRSaveSize & 15)
7381 // The extra size here, if triggered, will always be 8.
7382 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
7383 } else
7384 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
7385
7386 SDValue FIN;
7387 if (Subtarget->isWindowsArm64EC()) {
7388 // With the Arm64EC ABI, we reserve the save area as usual, but we
7389 // compute its address relative to x4. For a normal AArch64->AArch64
7390 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
7391 // different address.
7392 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
7393 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7394 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
7395 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
7396 } else {
7397 FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
7398 }
7399
7400 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
7401 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
7402 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7403 SDValue Store =
7404 DAG.getStore(Val.getValue(1), DL, Val, FIN,
7406 MF, GPRIdx, (i - FirstVariadicGPR) * 8)
7407 : MachinePointerInfo::getStack(MF, i * 8));
7408 MemOps.push_back(Store);
7409 FIN =
7410 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
7411 }
7412 }
7413 FuncInfo->setVarArgsGPRIndex(GPRIdx);
7414 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
7415
7416 if (Subtarget->hasFPARMv8() && !IsWin64) {
7418 const unsigned NumFPRArgRegs = FPRArgRegs.size();
7419 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
7420
7421 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
7422 int FPRIdx = 0;
7423 if (FPRSaveSize != 0) {
7424 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
7425
7426 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
7427
7428 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
7429 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
7430 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
7431
7432 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
7433 MachinePointerInfo::getStack(MF, i * 16));
7434 MemOps.push_back(Store);
7435 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
7436 DAG.getConstant(16, DL, PtrVT));
7437 }
7438 }
7439 FuncInfo->setVarArgsFPRIndex(FPRIdx);
7440 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
7441 }
7442
7443 if (!MemOps.empty()) {
7444 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
7445 }
7446}
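// Worked example (sketch): for a Win64 variadic function whose fixed arguments
// occupy x0 and x1, FirstVariadicGPR == 2, so GPRSaveSize = 8 * (8 - 2) = 48
// bytes and x2..x7 are spilled to the fixed save area. On non-Windows targets
// with FP available, the unallocated q registers are saved in 16-byte slots in
// the same way.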
7447
7448/// LowerCallResult - Lower the result values of a call into the
7449/// appropriate copies out of appropriate physical registers.
7450SDValue AArch64TargetLowering::LowerCallResult(
7451 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
7452 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
7453 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
7454 SDValue ThisVal, bool RequiresSMChange) const {
7455 DenseMap<unsigned, SDValue> CopiedRegs;
7456 // Copy all of the result registers out of their specified physreg.
7457 for (unsigned i = 0; i != RVLocs.size(); ++i) {
7458 CCValAssign VA = RVLocs[i];
7459
7460 // Pass 'this' value directly from the argument to return value, to avoid
7461 // reg unit interference
7462 if (i == 0 && isThisReturn) {
7463 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
7464 "unexpected return calling convention register assignment");
7465 InVals.push_back(ThisVal);
7466 continue;
7467 }
7468
7469 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
7470 // allows one use of a physreg per block.
7471 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
7472 if (!Val) {
7473 Val =
7474 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
7475 Chain = Val.getValue(1);
7476 InGlue = Val.getValue(2);
7477 CopiedRegs[VA.getLocReg()] = Val;
7478 }
7479
7480 switch (VA.getLocInfo()) {
7481 default:
7482 llvm_unreachable("Unknown loc info!");
7483 case CCValAssign::Full:
7484 break;
7485 case CCValAssign::BCvt:
7486 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
7487 break;
7489 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
7490 DAG.getConstant(32, DL, VA.getLocVT()));
7491 [[fallthrough]];
7492 case CCValAssign::AExt:
7493 [[fallthrough]];
7494 case CCValAssign::ZExt:
7495 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
7496 break;
7497 }
7498
7499 if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
7501 Val);
7502
7503 InVals.push_back(Val);
7504 }
7505
7506 return Chain;
7507}
7508
7509/// Return true if the calling convention is one that we can guarantee TCO for.
7510static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
7511 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
7513}
7514
7515/// Return true if we might ever do TCO for calls with this calling convention.
7517 switch (CC) {
7518 case CallingConv::C:
7522 case CallingConv::Swift:
7524 case CallingConv::Tail:
7525 case CallingConv::Fast:
7526 return true;
7527 default:
7528 return false;
7529 }
7530}
7531
7533 const AArch64Subtarget *Subtarget,
7535 CCState &CCInfo) {
7536 const SelectionDAG &DAG = CLI.DAG;
7537 CallingConv::ID CalleeCC = CLI.CallConv;
7538 bool IsVarArg = CLI.IsVarArg;
7539 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7540 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
7541
7542 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
7543 // for the shadow store.
7544 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
7545 CCInfo.AllocateStack(32, Align(16));
7546
7547 unsigned NumArgs = Outs.size();
7548 for (unsigned i = 0; i != NumArgs; ++i) {
7549 MVT ArgVT = Outs[i].VT;
7550 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
7551
7552 bool UseVarArgCC = false;
7553 if (IsVarArg) {
7554 // On Windows, the fixed arguments in a vararg call are passed in GPRs
7555 // too, so use the vararg CC to force them to integer registers.
7556 if (IsCalleeWin64) {
7557 UseVarArgCC = true;
7558 } else {
7559 UseVarArgCC = !Outs[i].IsFixed;
7560 }
7561 }
7562
7563 if (!UseVarArgCC) {
7564 // Get type of the original argument.
7565 EVT ActualVT =
7566 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
7567 /*AllowUnknown*/ true);
7568 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
7569 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
7570 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
7571 ArgVT = MVT::i8;
7572 else if (ActualMVT == MVT::i16)
7573 ArgVT = MVT::i16;
7574 }
7575
7576 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
7577 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
7578 assert(!Res && "Call operand has unhandled type");
7579 (void)Res;
7580 }
7581}
7582
7583bool AArch64TargetLowering::isEligibleForTailCallOptimization(
7584 const CallLoweringInfo &CLI) const {
7585 CallingConv::ID CalleeCC = CLI.CallConv;
7586 if (!mayTailCallThisCC(CalleeCC))
7587 return false;
7588
7589 SDValue Callee = CLI.Callee;
7590 bool IsVarArg = CLI.IsVarArg;
7591 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7592 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
7593 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
7594 const SelectionDAG &DAG = CLI.DAG;
7596 const Function &CallerF = MF.getFunction();
7597 CallingConv::ID CallerCC = CallerF.getCallingConv();
7598
7599 // SME Streaming functions are not eligible for TCO as they may require
7600 // the streaming mode or ZA to be restored after returning from the call.
7601 SMEAttrs CallerAttrs(MF.getFunction());
7602 auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal);
7603 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
7604 CallerAttrs.requiresLazySave(CalleeAttrs) ||
7605 CallerAttrs.hasStreamingBody())
7606 return false;
7607
7608 // Functions using the C or Fast calling convention that have an SVE signature
7609 // preserve more registers and should assume the SVE_VectorCall CC.
7610 // The check for matching callee-saved regs will determine whether it is
7611 // eligible for TCO.
7612 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
7615
7616 bool CCMatch = CallerCC == CalleeCC;
7617
7618 // When using the Windows calling convention on a non-windows OS, we want
7619 // to back up and restore X18 in such functions; we can't do a tail call
7620 // from those functions.
7621 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
7622 CalleeCC != CallingConv::Win64)
7623 return false;
7624
7625 // Byval parameters hand the function a pointer directly into the stack area
7626 // we want to reuse during a tail call. Working around this *is* possible (see
7627 // X86) but less efficient and uglier in LowerCall.
7628 for (Function::const_arg_iterator i = CallerF.arg_begin(),
7629 e = CallerF.arg_end();
7630 i != e; ++i) {
7631 if (i->hasByValAttr())
7632 return false;
7633
7634 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
7635 // In this case, it is necessary to save/restore X0 in the callee. Tail
7636 // call opt interferes with this. So we disable tail call opt when the
7637 // caller has an argument with "inreg" attribute.
7638
7639 // FIXME: Check whether the callee also has an "inreg" argument.
7640 if (i->hasInRegAttr())
7641 return false;
7642 }
7643
7644 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
7645 return CCMatch;
7646
7647 // Externally-defined functions with weak linkage should not be
7648 // tail-called on AArch64 when the OS does not support dynamic
7649 // pre-emption of symbols, as the AAELF spec requires normal calls
7650 // to undefined weak functions to be replaced with a NOP or jump to the
7651 // next instruction. The behaviour of branch instructions in this
7652 // situation (as used for tail calls) is implementation-defined, so we
7653 // cannot rely on the linker replacing the tail call with a return.
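// For example (illustrative): given 'extern void weak_fn(void) __attribute__((weak));',
// a normal call site 'bl weak_fn' to an undefined weak_fn can be rewritten by the
// linker to a NOP, but a tail call lowered to 'b weak_fn' has no guaranteed
// replacement, so we conservatively refuse the tail call here.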
7654 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
7655 const GlobalValue *GV = G->getGlobal();
7657 if (GV->hasExternalWeakLinkage() &&
7658 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
7659 return false;
7660 }
7661
7662 // Now we search for cases where we can use a tail call without changing the
7663 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
7664 // concept.
7665
7666 // I want anyone implementing a new calling convention to think long and hard
7667 // about this assert.
7668 assert((!IsVarArg || CalleeCC == CallingConv::C) &&
7669 "Unexpected variadic calling convention");
7670
7671 LLVMContext &C = *DAG.getContext();
7672 // Check that the call results are passed in the same way.
7673 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
7674 CCAssignFnForCall(CalleeCC, IsVarArg),
7675 CCAssignFnForCall(CallerCC, IsVarArg)))
7676 return false;
7677 // The callee has to preserve all registers the caller needs to preserve.
7678 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7679 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
7680 if (!CCMatch) {
7681 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
7682 if (Subtarget->hasCustomCallingConv()) {
7683 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
7684 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
7685 }
7686 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
7687 return false;
7688 }
7689
7690 // Nothing more to check if the callee is taking no arguments
7691 if (Outs.empty())
7692 return true;
7693
7695 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
7696
7697 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
7698
7699 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
7700 // When the call is musttail, additional checks have already been done, so we can safely skip this check.
7701 // At least two cases here: if caller is fastcc then we can't have any
7702 // memory arguments (we'd be expected to clean up the stack afterwards). If
7703 // caller is C then we could potentially use its argument area.
7704
7705 // FIXME: for now we take the most conservative of these in both cases:
7706 // disallow all variadic memory operands.
7707 for (const CCValAssign &ArgLoc : ArgLocs)
7708 if (!ArgLoc.isRegLoc())
7709 return false;
7710 }
7711
7712 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7713
7714 // If any of the arguments is passed indirectly, it must be SVE, so the
7715 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
7716 // allocate space on the stack. That is why we explicitly decide here that
7717 // the call cannot be a tail call.
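// (Illustrative example: an SVE argument such as <vscale x 4 x i32> that is assigned
// CCValAssign::Indirect is stored by the caller to a stack object of its own and only
// a pointer to it is passed, as done in LowerCall later in this file, so its size
// never shows up in getBytesInStackArgArea.)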
7718 if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
7719 assert((A.getLocInfo() != CCValAssign::Indirect ||
7720 A.getValVT().isScalableVector() ||
7721 Subtarget->isWindowsArm64EC()) &&
7722 "Expected value to be scalable");
7723 return A.getLocInfo() == CCValAssign::Indirect;
7724 }))
7725 return false;
7726
7727 // If the stack arguments for this call do not fit into our own save area then
7728 // the call cannot be made tail.
7729 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
7730 return false;
7731
7732 const MachineRegisterInfo &MRI = MF.getRegInfo();
7733 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
7734 return false;
7735
7736 return true;
7737}
7738
7739SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
7740 SelectionDAG &DAG,
7741 MachineFrameInfo &MFI,
7742 int ClobberedFI) const {
7743 SmallVector<SDValue, 8> ArgChains;
7744 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
7745 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
7746
7747 // Include the original chain at the beginning of the list. When this is
7748 // used by target LowerCall hooks, this helps legalize find the
7749 // CALLSEQ_BEGIN node.
7750 ArgChains.push_back(Chain);
7751
7752 // Add a chain value for each incoming stack-argument load that overlaps the clobbered slot.
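// Illustrative example (hypothetical offsets): if the clobbered fixed object covers
// bytes [16, 23] of the incoming-argument area and an argument load covers bytes
// [20, 27], the two ranges overlap, so that load's chain is token-factored in below
// and the load is forced to complete before the store that reuses the slot.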
7753 for (SDNode *U : DAG.getEntryNode().getNode()->uses())
7754 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
7755 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
7756 if (FI->getIndex() < 0) {
7757 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
7758 int64_t InLastByte = InFirstByte;
7759 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
7760
7761 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
7762 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
7763 ArgChains.push_back(SDValue(L, 1));
7764 }
7765
7766 // Build a tokenfactor for all the chains.
7767 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
7768}
7769
7770bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
7771 bool TailCallOpt) const {
7772 return (CallCC == CallingConv::Fast && TailCallOpt) ||
7773 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
7774}
7775
7776// Check if the value is zero-extended from i1 to i8
7777static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
7778 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
7779 if (SizeInBits < 8)
7780 return false;
7781
7782 APInt RequiredZero(SizeInBits, 0xFE);
7783 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
7784 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
7785 return ZExtBool;
7786}
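// Illustrative sketch of the check above: RequiredZero has only bits 1-7 set
// (0xFE), so the test succeeds exactly when known-bits analysis proves the low
// byte of Arg is already 0 or 1, i.e. the value is already a validly
// zero-extended i1, and the explicit trunc/zext pair in LowerCall can be skipped.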
7787
7788void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
7789 SDNode *Node) const {
7790 // Live-in physreg copies that are glued to SMSTART are applied as
7791 // implicit-defs in the InstrEmitter. Here we remove them so the register
7792 // allocator can pass call arguments in callee-saved registers without extra
7793 // copies to work around these fake clobbers of actually-preserved GPRs.
7794 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
7795 MI.getOpcode() == AArch64::MSRpstatePseudo) {
7796 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
7797 if (MachineOperand &MO = MI.getOperand(I);
7798 MO.isReg() && MO.isImplicit() && MO.isDef() &&
7799 (AArch64::GPR32RegClass.contains(MO.getReg()) ||
7800 AArch64::GPR64RegClass.contains(MO.getReg())))
7801 MI.removeOperand(I);
7802
7803 // The SVE vector length can change when entering/leaving streaming mode.
7804 if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
7805 MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
7806 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
7807 /*IsImplicit=*/true));
7808 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
7809 /*IsImplicit=*/true));
7810 }
7811 }
7812
7813 // Add an implicit use of 'VG' for ADDXri/SUBXri. These instructions normally
7814 // have nothing to do with VG, but they are sometimes used to materialise a
7815 // frame address. If they reference a frame index for a scalable-vector
7816 // object, materialising the address will likely require an ADDVL instruction,
7817 // which reads VG.
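// For example (a sketch, with hypothetical registers and offset): materialising
// the address of an SVE stack object typically expands to something like
// 'addvl x0, x29, #-2', and ADDVL scales its immediate by the vector length, so
// it implicitly reads VG.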
7818 const MachineFunction &MF = *MI.getMF();
7820 (MI.getOpcode() == AArch64::ADDXri ||
7821 MI.getOpcode() == AArch64::SUBXri)) {
7822 const MachineOperand &MO = MI.getOperand(1);
7823 if (MO.isFI() && MF.getFrameInfo().getStackID(MO.getIndex()) ==
7825 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
7826 /*IsImplicit=*/true));
7827 }
7828}
7829
7830SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
7831 bool Enable, SDValue Chain,
7832 SDValue InGlue,
7833 unsigned Condition,
7834 SDValue PStateSM) const {
7837 FuncInfo->setHasStreamingModeChanges(true);
7838
7839 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7840 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
7841 SDValue MSROp =
7842 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
7843 SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
7844 SmallVector<SDValue> Ops = {Chain, MSROp, ConditionOp};
7845 if (Condition != AArch64SME::Always) {
7846 assert(PStateSM && "PStateSM should be defined");
7847 Ops.push_back(PStateSM);
7848 }
7849 Ops.push_back(RegMask);
7850
7851 if (InGlue)
7852 Ops.push_back(InGlue);
7853
7854 unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
7855 return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
7856}
7857
7858static unsigned getSMCondition(const SMEAttrs &CallerAttrs,
7859 const SMEAttrs &CalleeAttrs) {
7860 if (!CallerAttrs.hasStreamingCompatibleInterface() ||
7861 CallerAttrs.hasStreamingBody())
7862 return AArch64SME::Always;
7863 if (CalleeAttrs.hasNonStreamingInterface())
7865 if (CalleeAttrs.hasStreamingInterface())
7867
7868 llvm_unreachable("Unsupported attributes");
7869}
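// Illustrative note on the conditions above: when the caller is
// streaming-compatible (and not itself a streaming body), whether an
// SMSTART/SMSTOP is actually needed depends on the caller's runtime mode, so the
// returned condition predicates the mode switch on the PStateSM value computed
// in LowerCall rather than toggling the mode unconditionally.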
7870
7871/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
7872/// and add input and output parameter nodes.
7873SDValue
7874AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
7875 SmallVectorImpl<SDValue> &InVals) const {
7876 SelectionDAG &DAG = CLI.DAG;
7877 SDLoc &DL = CLI.DL;
7878 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7879 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
7881 SDValue Chain = CLI.Chain;
7882 SDValue Callee = CLI.Callee;
7883 bool &IsTailCall = CLI.IsTailCall;
7884 CallingConv::ID &CallConv = CLI.CallConv;
7885 bool IsVarArg = CLI.IsVarArg;
7886
7889 bool IsThisReturn = false;
7890
7892 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
7893 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
7894 bool IsSibCall = false;
7895 bool GuardWithBTI = false;
7896
7897 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
7898 !Subtarget->noBTIAtReturnTwice()) {
7899 GuardWithBTI = FuncInfo->branchTargetEnforcement();
7900 }
7901
7902 // Analyze operands of the call, assigning locations to each operand.
7904 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
7905
7906 if (IsVarArg) {
7907 unsigned NumArgs = Outs.size();
7908
7909 for (unsigned i = 0; i != NumArgs; ++i) {
7910 if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
7911 report_fatal_error("Passing SVE types to variadic functions is "
7912 "currently not supported");
7913 }
7914 }
7915
7916 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
7917
7918 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
7919 // Assign locations to each value returned by this call.
7921 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
7922 *DAG.getContext());
7923 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
7924
7925 // Check callee args/returns for SVE registers and set calling convention
7926 // accordingly.
7927 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
7928 auto HasSVERegLoc = [](CCValAssign &Loc) {
7929 if (!Loc.isRegLoc())
7930 return false;
7931 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
7932 AArch64::PPRRegClass.contains(Loc.getLocReg());
7933 };
7934 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
7935 CallConv = CallingConv::AArch64_SVE_VectorCall;
7936 }
7937
7938 if (IsTailCall) {
7939 // Check if it's really possible to do a tail call.
7940 IsTailCall = isEligibleForTailCallOptimization(CLI);
7941
7942 // A sibling call is one where we're under the usual C ABI and not planning
7943 // to change that, but can still do a tail call.
7944 if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
7945 CallConv != CallingConv::SwiftTail)
7946 IsSibCall = true;
7947
7948 if (IsTailCall)
7949 ++NumTailCalls;
7950 }
7951
7952 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
7953 report_fatal_error("failed to perform tail call elimination on a call "
7954 "site marked musttail");
7955
7956 // Get a count of how many bytes are to be pushed on the stack.
7957 unsigned NumBytes = CCInfo.getStackSize();
7958
7959 if (IsSibCall) {
7960 // Since we're not changing the ABI to make this a tail call, the memory
7961 // operands are already available in the caller's incoming argument space.
7962 NumBytes = 0;
7963 }
7964
7965 // FPDiff is the byte offset of the call's argument area from the callee's.
7966 // Stores to callee stack arguments will be placed in FixedStackSlots offset
7967 // by this amount for a tail call. In a sibling call it must be 0 because the
7968 // caller will deallocate the entire stack and the callee still expects its
7969 // arguments to begin at SP+0. Completely unused for non-tail calls.
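// Illustrative example (hypothetical sizes): if the caller reserved 32 bytes of
// incoming argument space and this tail call needs 48 bytes of stack arguments
// (after 16-byte alignment), FPDiff is -16, and the 16 extra bytes are recorded
// via setTailCallReservedStack below so the prologue reserves them.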
7970 int FPDiff = 0;
7971
7972 if (IsTailCall && !IsSibCall) {
7973 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
7974
7975 // Since callee will pop argument stack as a tail call, we must keep the
7976 // popped size 16-byte aligned.
7977 NumBytes = alignTo(NumBytes, 16);
7978
7979 // FPDiff will be negative if this tail call requires more space than we
7980 // would automatically have in our incoming argument space. Positive if we
7981 // can actually shrink the stack.
7982 FPDiff = NumReusableBytes - NumBytes;
7983
7984 // Update the required reserved area if this is the tail call requiring the
7985 // most argument stack space.
7986 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
7987 FuncInfo->setTailCallReservedStack(-FPDiff);
7988
7989 // The stack pointer must be 16-byte aligned at all times it's used for a
7990 // memory operation, which in practice means at *all* times and in
7991 // particular across call boundaries. Therefore our own arguments started at
7992 // a 16-byte aligned SP and the delta applied for the tail call should
7993 // satisfy the same constraint.
7994 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
7995 }
7996
7997 // Determine whether we need any streaming mode changes.
7998 SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction());
7999 if (CLI.CB)
8000 CalleeAttrs = SMEAttrs(*CLI.CB);
8001 else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
8002 CalleeAttrs = SMEAttrs(ES->getSymbol());
8003
8004 auto DescribeCallsite =
8006 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
8007 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
8008 R << ore::NV("Callee", ES->getSymbol());
8009 else if (CLI.CB && CLI.CB->getCalledFunction())
8010 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
8011 else
8012 R << "unknown callee";
8013 R << "'";
8014 return R;
8015 };
8016
8017 bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
8018 if (RequiresLazySave) {
8019 unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj();
8021 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj,
8023 SDValue NumZaSaveSlicesAddr =
8024 DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
8025 DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));
8026 SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8027 DAG.getConstant(1, DL, MVT::i32));
8028 Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr,
8029 MPI, MVT::i16);
8030 Chain = DAG.getNode(
8031 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
8032 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8033 TPIDR2ObjAddr);
8035 ORE.emit([&]() {
8036 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
8037 CLI.CB)
8038 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
8039 &MF.getFunction());
8040 return DescribeCallsite(R) << " sets up a lazy save for ZA";
8041 });
8042 }
8043
8044 SDValue PStateSM;
8045 bool RequiresSMChange = CallerAttrs.requiresSMChange(CalleeAttrs);
8046 if (RequiresSMChange) {
8047 if (CallerAttrs.hasStreamingInterfaceOrBody())
8048 PStateSM = DAG.getConstant(1, DL, MVT::i64);
8049 else if (CallerAttrs.hasNonStreamingInterface())
8050 PStateSM = DAG.getConstant(0, DL, MVT::i64);
8051 else
8052 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
8054 ORE.emit([&]() {
8055 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
8056 CLI.CB)
8057 : OptimizationRemarkAnalysis("sme", "SMETransition",
8058 &MF.getFunction());
8059 DescribeCallsite(R) << " requires a streaming mode transition";
8060 return R;
8061 });
8062 }
8063
8064 SDValue ZTFrameIdx;
8065 MachineFrameInfo &MFI = MF.getFrameInfo();
8066 bool ShouldPreserveZT0 = CallerAttrs.requiresPreservingZT0(CalleeAttrs);
8067
8068 // If the caller has ZT0 state which will not be preserved by the callee,
8069 // spill ZT0 before the call.
8070 if (ShouldPreserveZT0) {
8071 unsigned ZTObj = MFI.CreateSpillStackObject(64, Align(16));
8072 ZTFrameIdx = DAG.getFrameIndex(
8073 ZTObj,
8075
8076 Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
8077 {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
8078 }
8079
8080 // If the caller shares ZT0 but the callee does not share ZA, we need to stop
8081 // PSTATE.ZA before the call if there is no lazy-save active.
8082 bool DisableZA = CallerAttrs.requiresDisablingZABeforeCall(CalleeAttrs);
8083 assert((!DisableZA || !RequiresLazySave) &&
8084 "Lazy-save should have PSTATE.ZA=1 on entry to the function");
8085
8086 if (DisableZA)
8087 Chain = DAG.getNode(
8088 AArch64ISD::SMSTOP, DL, MVT::Other, Chain,
8089 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
8090 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
8091
8092 // Adjust the stack pointer for the new arguments...
8093 // These operations are automatically eliminated by the prolog/epilog pass
8094 if (!IsSibCall)
8095 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
8096
8097 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
8099
8101 SmallSet<unsigned, 8> RegsUsed;
8102 SmallVector<SDValue, 8> MemOpChains;
8103 auto PtrVT = getPointerTy(DAG.getDataLayout());
8104
8105 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
8106 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
8107 for (const auto &F : Forwards) {
8108 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
8109 RegsToPass.emplace_back(F.PReg, Val);
8110 }
8111 }
8112
8113 // Walk the register/memloc assignments, inserting copies/loads.
8114 unsigned ExtraArgLocs = 0;
8115 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
8116 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
8117 SDValue Arg = OutVals[i];
8118 ISD::ArgFlagsTy Flags = Outs[i].Flags;
8119
8120 // Promote the value if needed.
8121 switch (VA.getLocInfo()) {
8122 default:
8123 llvm_unreachable("Unknown loc info!");
8124 case CCValAssign::Full:
8125 break;
8126 case CCValAssign::SExt:
8127 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
8128 break;
8129 case CCValAssign::ZExt:
8130 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
8131 break;
8132 case CCValAssign::AExt:
8133 if (Outs[i].ArgVT == MVT::i1) {
8134 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
8135 //
8136 // Check if we actually have to do this, because the value may
8137 // already be zero-extended.
8138 //
8139 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
8140 // and rely on DAGCombiner to fold this, because the following
8141 // (anyext i32) is combined with (zext i8) in DAG.getNode:
8142 //
8143 // (ext (zext x)) -> (zext x)
8144 //
8145 // This will give us (zext i32), which we cannot remove, so
8146 // try to check this beforehand.
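// Illustrative example: for a call such as 'call void @f(i1 zeroext %b)' where
// %b comes from an icmp, checkZExtBool usually proves that bits 1-7 are already
// zero and no extra nodes are emitted; otherwise we build
// (zext i8 (trunc i1 Arg)) here before the final any-extend to the i32 LocVT.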
8147 if (!checkZExtBool(Arg, DAG)) {
8148 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
8149 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
8150 }
8151 }
8152 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
8153 break;
8155 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
8156 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
8157 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
8158 DAG.getConstant(32, DL, VA.getLocVT()));
8159 break;
8160 case CCValAssign::BCvt:
8161 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
8162 break;
8163 case CCValAssign::Trunc:
8164 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8165 break;
8166 case CCValAssign::FPExt:
8167 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
8168 break;
8170 bool isScalable = VA.getValVT().isScalableVT();
8171 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
8172 "Indirect arguments should be scalable on most subtargets");
8173
8174 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
8175 uint64_t PartSize = StoreSize;
8176 unsigned NumParts = 1;
8177 if (Outs[i].Flags.isInConsecutiveRegs()) {
8178 assert(!Outs[i].Flags.isInConsecutiveRegsLast());
8179 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8180 ++NumParts;
8181 StoreSize *= NumParts;
8182 }
8183
8184 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
8185 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
8186 MachineFrameInfo &MFI = MF.getFrameInfo();
8187 int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
8188 if (isScalable)
8190
8194 SDValue SpillSlot = Ptr;
8195
8196 // Ensure we generate all stores for each tuple part, whilst updating the
8197 // pointer after each store correctly using vscale.
8198 while (NumParts) {
8199 SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
8200 MemOpChains.push_back(Store);
8201
8202 NumParts--;
8203 if (NumParts > 0) {
8204 SDValue BytesIncrement;
8205 if (isScalable) {
8206 BytesIncrement = DAG.getVScale(
8207 DL, Ptr.getValueType(),
8208 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
8209 } else {
8210 BytesIncrement = DAG.getConstant(
8211 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
8212 Ptr.getValueType());
8213 }
8215 Flags.setNoUnsignedWrap(true);
8216
8217 MPI = MachinePointerInfo(MPI.getAddrSpace());
8218 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8219 BytesIncrement, Flags);
8220 ExtraArgLocs++;
8221 i++;
8222 }
8223 }
8224
8225 Arg = SpillSlot;
8226 break;
8227 }
8228
8229 if (VA.isRegLoc()) {
8230 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
8231 Outs[0].VT == MVT::i64) {
8232 assert(VA.getLocVT() == MVT::i64 &&
8233 "unexpected calling convention register assignment");
8234 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
8235 "unexpected use of 'returned'");
8236 IsThisReturn = true;
8237 }
8238 if (RegsUsed.count(VA.getLocReg())) {
8239 // If this register has already been used then we're trying to pack
8240 // parts of an [N x i32] into an X-register. The extension type will
8241 // take care of putting the two halves in the right place but we have to
8242 // combine them.
8243 SDValue &Bits =
8244 llvm::find_if(RegsToPass,
8245 [=](const std::pair<unsigned, SDValue> &Elt) {
8246 return Elt.first == VA.getLocReg();
8247 })
8248 ->second;
8249 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
8250 // Call site info is used for function's parameter entry value
8251 // tracking. For now we track only simple cases when parameter
8252 // is transferred through whole register.
8254 [&VA](MachineFunction::ArgRegPair ArgReg) {
8255 return ArgReg.Reg == VA.getLocReg();
8256 });
8257 } else {
8258 // Add an extra level of indirection for streaming mode changes by
8259 // using a pseudo copy node that cannot be rematerialised between a
8260 // smstart/smstop and the call by the simple register coalescer.
8261 if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
8262 Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8263 Arg.getValueType(), Arg);
8264 RegsToPass.emplace_back(VA.getLocReg(), Arg);
8265 RegsUsed.insert(VA.getLocReg());
8266 const TargetOptions &Options = DAG.getTarget().Options;
8267 if (Options.EmitCallSiteInfo)
8268 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
8269 }
8270 } else {
8271 assert(VA.isMemLoc());
8272
8273 SDValue DstAddr;
8274 MachinePointerInfo DstInfo;
8275
8276 // FIXME: This works on big-endian for composite byvals, which are the
8277 // common case. It should work for fundamental types too.
8278 uint32_t BEAlign = 0;
8279 unsigned OpSize;
8280 if (VA.getLocInfo() == CCValAssign::Indirect ||
8282 OpSize = VA.getLocVT().getFixedSizeInBits();
8283 else
8284 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
8285 : VA.getValVT().getSizeInBits();
8286 OpSize = (OpSize + 7) / 8;
8287 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
8288 !Flags.isInConsecutiveRegs()) {
8289 if (OpSize < 8)
8290 BEAlign = 8 - OpSize;
8291 }
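// Worked example (illustrative, assuming the usual 8-byte argument slot): an
// i16 argument gives OpSize = 2 and BEAlign = 6, so on big-endian the store
// below lands in the last two bytes of the slot rather than at its start.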
8292 unsigned LocMemOffset = VA.getLocMemOffset();
8293 int32_t Offset = LocMemOffset + BEAlign;
8294 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
8295 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
8296
8297 if (IsTailCall) {
8298 Offset = Offset + FPDiff;
8299 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
8300
8301 DstAddr = DAG.getFrameIndex(FI, PtrVT);
8302 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
8303
8304 // Make sure any stack arguments overlapping with where we're storing
8305 // are loaded before this eventual operation. Otherwise they'll be
8306 // clobbered.
8307 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
8308 } else {
8309 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
8310
8311 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
8312 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
8313 }
8314
8315 if (Outs[i].Flags.isByVal()) {
8316 SDValue SizeNode =
8317 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
8318 SDValue Cpy = DAG.getMemcpy(
8319 Chain, DL, DstAddr, Arg, SizeNode,
8320 Outs[i].Flags.getNonZeroByValAlign(),
8321 /*isVol = */ false, /*AlwaysInline = */ false,
8322 /*isTailCall = */ false, DstInfo, MachinePointerInfo());
8323
8324 MemOpChains.push_back(Cpy);
8325 } else {
8326 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
8327 // promoted to a legal register type i32, we should truncate Arg back to
8328 // i1/i8/i16.
8329 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
8330 VA.getValVT() == MVT::i16)
8331 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
8332
8333 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
8334 MemOpChains.push_back(Store);
8335 }
8336 }
8337 }
8338
8339 if (IsVarArg && Subtarget->isWindowsArm64EC()) {
8340 SDValue ParamPtr = StackPtr;
8341 if (IsTailCall) {
8342 // Create a dummy object at the top of the stack that can be used to get
8343 // the SP after the epilogue
8344 int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
8345 ParamPtr = DAG.getFrameIndex(FI, PtrVT);
8346 }
8347
8348 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
8349 // describing the argument list. x4 contains the address of the
8350 // first stack parameter. x5 contains the size in bytes of all parameters
8351 // passed on the stack.
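// For example (a hypothetical call): if 40 bytes of arguments are passed on the
// stack, x4 ends up holding the address of the first stack parameter (SP here,
// or the dummy frame index below for tail calls) and x5 holds the value 40.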
8352 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
8353 RegsToPass.emplace_back(AArch64::X5,
8354 DAG.getConstant(NumBytes, DL, MVT::i64));
8355 }
8356
8357 if (!MemOpChains.empty())
8358 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
8359
8360 SDValue InGlue;
8361 if (RequiresSMChange) {
8362 SDValue NewChain = changeStreamingMode(
8363 DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain, InGlue,
8364 getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
8365 Chain = NewChain.getValue(0);
8366 InGlue = NewChain.getValue(1);
8367 }
8368
8369 // Build a sequence of copy-to-reg nodes chained together with token chain
8370 // and flag operands which copy the outgoing args into the appropriate regs.
8371 for (auto &RegToPass : RegsToPass) {
8372 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
8373 RegToPass.second, InGlue);
8374 InGlue = Chain.getValue(1);
8375 }
8376
8377 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
8378 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
8379 // node so that legalize doesn't hack it.
8380 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
8381 auto GV = G->getGlobal();
8382 unsigned OpFlags =
8384 if (OpFlags & AArch64II::MO_GOT) {
8385 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
8386 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
8387 } else {
8388 const GlobalValue *GV = G->getGlobal();
8389 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
8390 }
8391 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
8392 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
8393 Subtarget->isTargetMachO()) ||
8395 const char *Sym = S->getSymbol();
8396 if (UseGot) {
8398 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
8399 } else {
8400 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
8401 }
8402 }
8403
8404 // We don't usually want to end the call-sequence here because we would tidy
8405 // the frame up *after* the call, however in the ABI-changing tail-call case
8406 // we've carefully laid out the parameters so that when sp is reset they'll be
8407 // in the correct location.
8408 if (IsTailCall && !IsSibCall) {
8409 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
8410 InGlue = Chain.getValue(1);
8411 }
8412
8413 std::vector<SDValue> Ops;
8414 Ops.push_back(Chain);
8415 Ops.push_back(Callee);
8416
8417 if (IsTailCall) {
8418 // Each tail call may have to adjust the stack by a different amount, so
8419 // this information must travel along with the operation for eventual
8420 // consumption by emitEpilogue.
8421 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
8422 }
8423
8424 // Add argument registers to the end of the list so that they are known live
8425 // into the call.
8426 for (auto &RegToPass : RegsToPass)
8427 Ops.push_back(DAG.getRegister(RegToPass.first,
8428 RegToPass.second.getValueType()));
8429
8430 // Add a register mask operand representing the call-preserved registers.
8431 const uint32_t *Mask;
8432 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8433 if (IsThisReturn) {
8434 // For 'this' returns, use the X0-preserving mask if applicable
8435 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
8436 if (!Mask) {
8437 IsThisReturn = false;
8438 Mask = TRI->getCallPreservedMask(MF, CallConv);
8439 }
8440 } else
8441 Mask = TRI->getCallPreservedMask(MF, CallConv);
8442
8443 if (Subtarget->hasCustomCallingConv())
8444 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
8445
8446 if (TRI->isAnyArgRegReserved(MF))
8447 TRI->emitReservedArgRegCallError(MF);
8448
8449 assert(Mask && "Missing call preserved mask for calling convention");
8450 Ops.push_back(DAG.getRegisterMask(Mask));
8451
8452 if (InGlue.getNode())
8453 Ops.push_back(InGlue);
8454
8455 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
8456
8457 // If we're doing a tail call, use a TC_RETURN here rather than an
8458 // actual call instruction.
8459 if (IsTailCall) {
8461 SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
8462
8463 if (IsCFICall)
8464 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
8465
8466 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
8467 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
8468 return Ret;
8469 }
8470
8471 unsigned CallOpc = AArch64ISD::CALL;
8472 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
8473 // be expanded to the call, directly followed by a special marker sequence and
8474 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
8475 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
8476 assert(!IsTailCall &&
8477 "tail calls cannot be marked with clang.arc.attachedcall");
8478 CallOpc = AArch64ISD::CALL_RVMARKER;
8479
8480 // Add a target global address for the retainRV/claimRV runtime function
8481 // just before the call target.
8482 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
8483 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
8484 Ops.insert(Ops.begin() + 1, GA);
8485 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
8487 } else if (GuardWithBTI) {
8488 CallOpc = AArch64ISD::CALL_BTI;
8489 }
8490
8491 // Returns a chain and a flag for retval copy to use.
8492 Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops);
8493
8494 if (IsCFICall)
8495 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
8496
8497 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
8498 InGlue = Chain.getValue(1);
8499 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
8500
8501 uint64_t CalleePopBytes =
8502 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
8503
8504 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
8505 InGlue = Chain.getValue(1);
8506
8507 // Handle result values, copying them out of physregs into vregs that we
8508 // return.
8509 SDValue Result = LowerCallResult(
8510 Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
8511 IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
8512
8513 if (!Ins.empty())
8514 InGlue = Result.getValue(Result->getNumValues() - 1);
8515
8516 if (RequiresSMChange) {
8517 assert(PStateSM && "Expected a PStateSM to be set");
8518 Result = changeStreamingMode(
8519 DAG, DL, !CalleeAttrs.hasStreamingInterface(), Result, InGlue,
8520 getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
8521 }
8522
8523 if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs))
8524 // Unconditionally resume ZA.
8525 Result = DAG.getNode(
8526 AArch64ISD::SMSTART, DL, MVT::Other, Result,
8527 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
8528 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
8529
8530 if (ShouldPreserveZT0)
8531 Result =
8532 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
8533 {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
8534
8535 if (RequiresLazySave) {
8536 // Conditionally restore the lazy save using a pseudo node.
8537 unsigned FI = FuncInfo->getLazySaveTPIDR2Obj();
8538 SDValue RegMask = DAG.getRegisterMask(
8539 TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
8540 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
8541 "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout()));
8542 SDValue TPIDR2_EL0 = DAG.getNode(
8543 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
8544 DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
8545
8546 // Copy the address of the TPIDR2 block into X0 before 'calling' the
8547 // RESTORE_ZA pseudo.
8548 SDValue Glue;
8549 SDValue TPIDR2Block = DAG.getFrameIndex(
8551 Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
8552 Result =
8553 DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
8554 {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
8555 RestoreRoutine, RegMask, Result.getValue(1)});
8556
8557 // Finally reset the TPIDR2_EL0 register to 0.
8558 Result = DAG.getNode(
8559 ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
8560 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8561 DAG.getConstant(0, DL, MVT::i64));
8562 }
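// Sketch of the lazy-save protocol this implements (per the SME ABI; the actual
// conditional call is produced when RESTORE_ZA is expanded, not spelled out
// here): TPIDR2_EL0 was set to point at our TPIDR2 block before the call, and a
// callee that wants ZA commits the save and clears TPIDR2_EL0, so afterwards,
// conceptually:
//   if (TPIDR2_EL0 == 0)
//     __arm_tpidr2_restore(&tpidr2_block);  // reload ZA from the save buffer
// followed by unconditionally resetting TPIDR2_EL0 to zero, as done just above.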
8563
8564 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) {
8565 for (unsigned I = 0; I < InVals.size(); ++I) {
8566 // The smstart/smstop is chained as part of the call, but when the
8567 // resulting chain is discarded (which happens when the call is not part
8568 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
8569 // smstart/smstop is chained to the result value. We can do that by doing
8570 // a vreg -> vreg copy.
8572 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
8573 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
8574 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
8575 InVals[I].getValueType());
8576 }
8577 }
8578
8579 return Result;
8580}
8581
8582bool AArch64TargetLowering::CanLowerReturn(
8583 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
8584 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
8585 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
8587 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
8588 return CCInfo.CheckReturn(Outs, RetCC);
8589}
8590
8591SDValue
8592AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
8593 bool isVarArg,
8595 const SmallVectorImpl<SDValue> &OutVals,
8596 const SDLoc &DL, SelectionDAG &DAG) const {
8597 auto &MF = DAG.getMachineFunction();
8598 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8599
8600 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
8602 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
8603 CCInfo.AnalyzeReturn(Outs, RetCC);
8604
8605 // Copy the result values into the output registers.
8606 SDValue Glue;
8608 SmallSet<unsigned, 4> RegsUsed;
8609 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
8610 ++i, ++realRVLocIdx) {
8611 CCValAssign &VA = RVLocs[i];
8612 assert(VA.isRegLoc() && "Can only return in registers!");
8613 SDValue Arg = OutVals[realRVLocIdx];
8614
8615 switch (VA.getLocInfo()) {
8616 default:
8617 llvm_unreachable("Unknown loc info!");
8618 case CCValAssign::Full:
8619 if (Outs[i].ArgVT == MVT::i1) {
8620 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
8621 // value. This is strictly redundant on Darwin (which uses "zeroext
8622 // i1"), but will be optimised out before ISel.
8623 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
8624 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
8625 }
8626 break;
8627 case CCValAssign::BCvt:
8628 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
8629 break;
8630 case CCValAssign::AExt:
8631 case CCValAssign::ZExt:
8632 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8633 break;
8635 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
8636 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8637 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
8638 DAG.getConstant(32, DL, VA.getLocVT()));
8639 break;
8640 }
8641
8642 if (RegsUsed.count(VA.getLocReg())) {
8643 SDValue &Bits =
8644 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
8645 return Elt.first == VA.getLocReg();
8646 })->second;
8647 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
8648 } else {
8649 RetVals.emplace_back(VA.getLocReg(), Arg);
8650 RegsUsed.insert(VA.getLocReg());
8651 }
8652 }
8653
8654 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8655
8656 // Emit SMSTOP before returning from a locally streaming function
8657 SMEAttrs FuncAttrs(MF.getFunction());
8658 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
8659 if (FuncAttrs.hasStreamingCompatibleInterface()) {
8660 Register Reg = FuncInfo->getPStateSMReg();
8661 assert(Reg.isValid() && "PStateSM Register is invalid");
8662 SDValue PStateSM = DAG.getCopyFromReg(Chain, DL, Reg, MVT::i64);
8663 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
8664 /*Glue*/ SDValue(),
8666 } else
8667 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
8668 /*Glue*/ SDValue(), AArch64SME::Always);
8669 Glue = Chain.getValue(1);
8670 }
8671
8672 SmallVector<SDValue, 4> RetOps(1, Chain);
8673 for (auto &RetVal : RetVals) {
8674 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
8675 isPassedInFPR(RetVal.second.getValueType()))
8676 RetVal.second = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8677 RetVal.second.getValueType(), RetVal.second);
8678 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
8679 Glue = Chain.getValue(1);
8680 RetOps.push_back(
8681 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
8682 }
8683
8684 // The Windows AArch64 ABIs require that, when returning a struct by value, we
8685 // copy the sret argument into X0 for the return.
8686 // We saved the argument into a virtual register in the entry block,
8687 // so now we copy the value out and into X0.
8688 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
8689 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
8691
8692 unsigned RetValReg = AArch64::X0;
8693 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8694 RetValReg = AArch64::X8;
8695 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
8696 Glue = Chain.getValue(1);
8697
8698 RetOps.push_back(
8699 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
8700 }
8701
8702 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
8703 if (I) {
8704 for (; *I; ++I) {
8705 if (AArch64::GPR64RegClass.contains(*I))
8706 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
8707 else if (AArch64::FPR64RegClass.contains(*I))
8708 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
8709 else
8710 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
8711 }
8712 }
8713
8714 RetOps[0] = Chain; // Update chain.
8715
8716 // Add the glue if we have it.
8717 if (Glue.getNode())
8718 RetOps.push_back(Glue);
8719
8720 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
8721 // ARM64EC entry thunks use a special return sequence: instead of a regular
8722 // "ret" instruction, they need to explicitly call the emulator.
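// Roughly (an illustrative sketch of the resulting code, not emitted verbatim
// here; registers are hypothetical):
//   adrp x9, __os_arm64x_dispatch_ret
//   ldr  x9, [x9, :lo12:__os_arm64x_dispatch_ret]
//   br   x9
// i.e. the thunk tail-calls the emulator's return dispatcher instead of 'ret'.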
8723 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8724 SDValue Arm64ECRetDest =
8725 DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
8726 Arm64ECRetDest =
8727 getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
8728 Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
8730 RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
8731 RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
8732 return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
8733 }
8734
8735 return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
8736}
8737
8738//===----------------------------------------------------------------------===//
8739// Other Lowering Code
8740//===----------------------------------------------------------------------===//
8741
8742SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
8743 SelectionDAG &DAG,
8744 unsigned Flag) const {
8745 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
8746 N->getOffset(), Flag);
8747}
8748
8749SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
8750 SelectionDAG &DAG,
8751 unsigned Flag) const {
8752 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
8753}
8754
8755SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
8756 SelectionDAG &DAG,
8757 unsigned Flag) const {
8758 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
8759 N->getOffset(), Flag);
8760}
8761
8762SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
8763 SelectionDAG &DAG,
8764 unsigned Flag) const {
8765 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
8766}
8767
8768SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
8769 SelectionDAG &DAG,
8770 unsigned Flag) const {
8771 return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
8772}
8773
8774// (loadGOT sym)
8775template <class NodeTy>
8776SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
8777 unsigned Flags) const {
8778 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
8779 SDLoc DL(N);
8780 EVT Ty = getPointerTy(DAG.getDataLayout());
8781 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
8782 // FIXME: Once remat is capable of dealing with instructions with register
8783 // operands, expand this into two nodes instead of using a wrapper node.
8784 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
8785}
8786
8787// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
8788template <class NodeTy>
8789SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
8790 unsigned Flags) const {
8791 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
8792 SDLoc DL(N);
8793 EVT Ty = getPointerTy(DAG.getDataLayout());
8794 const unsigned char MO_NC = AArch64II::MO_NC;
8795 return DAG.getNode(
8797 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
8798 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
8799 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
8800 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
8801}
8802
8803// (addlow (adrp %hi(sym)) %lo(sym))
8804template <class NodeTy>
8805SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
8806 unsigned Flags) const {
8807 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
8808 SDLoc DL(N);
8809 EVT Ty = getPointerTy(DAG.getDataLayout());
8810 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
8811 SDValue Lo = getTargetNode(N, Ty, DAG,
8814 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
8815}
8816
8817// (adr sym)
8818template <class NodeTy>
8819SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
8820 unsigned Flags) const {
8821 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
8822 SDLoc DL(N);
8823 EVT Ty = getPointerTy(DAG.getDataLayout());
8824 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
8825 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
8826}
8827
8828SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
8829 SelectionDAG &DAG) const {
8830 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
8831 const GlobalValue *GV = GN->getGlobal();
8832 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
8833
8834 if (OpFlags != AArch64II::MO_NO_FLAG)
8835 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
8836 "unexpected offset in global node");
8837
8838 // This also catches the large code model case for Darwin, and tiny code
8839 // model with got relocations.
8840 if ((OpFlags & AArch64II::MO_GOT) != 0) {
8841 return getGOT(GN, DAG, OpFlags);
8842 }
8843
8847 Result = getAddrLarge(GN, DAG, OpFlags);
8848 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
8849 Result = getAddrTiny(GN, DAG, OpFlags);
8850 } else {
8851 Result = getAddr(GN, DAG, OpFlags);
8852 }
8853 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8854 SDLoc DL(GN);
8856 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
8858 return Result;
8859}
8860
8861/// Convert a TLS address reference into the correct sequence of loads
8862/// and calls to compute the variable's address (for Darwin, currently) and
8863/// return an SDValue containing the final node.
8864
8865/// Darwin only has one TLS scheme which must be capable of dealing with the
8866/// fully general situation, in the worst case. This means:
8867/// + "extern __thread" declaration.
8868/// + Defined in a possibly unknown dynamic library.
8869///
8870/// The general system is that each __thread variable has a [3 x i64] descriptor
8871/// which contains information used by the runtime to calculate the address. The
8872/// only part of this the compiler needs to know about is the first xword, which
8873/// contains a function pointer that must be called with the address of the
8874/// entire descriptor in "x0".
8875///
8876/// Since this descriptor may be in a different unit, in general even the
8877/// descriptor must be accessed via an indirect load. The "ideal" code sequence
8878/// is:
8879/// adrp x0, _var@TLVPPAGE
8880/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
8881/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
8882/// ; the function pointer
8883/// blr x1 ; Uses descriptor address in x0
8884/// ; Address of _var is now in x0.
8885///
8886/// If the address of _var's descriptor *is* known to the linker, then it can
8887/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
8888/// a slight efficiency gain.
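///
/// In terms of the lowering below (a summary sketch): the descriptor address is
/// formed with a LOADgot of _var@TLVP, the first word of the descriptor is
/// loaded to obtain the function pointer, the descriptor address is copied into
/// x0, and an AArch64ISD::CALL through the loaded pointer yields the variable's
/// address back in x0.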
8889SDValue
8890AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
8891 SelectionDAG &DAG) const {
8892 assert(Subtarget->isTargetDarwin() &&
8893 "This function expects a Darwin target");
8894
8895 SDLoc DL(Op);
8896 MVT PtrVT = getPointerTy(DAG.getDataLayout());
8897 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
8898 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
8899
8900 SDValue TLVPAddr =
8901 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
8902 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
8903
8904 // The first entry in the descriptor is a function pointer that we must call
8905 // to obtain the address of the variable.
8906 SDValue Chain = DAG.getEntryNode();
8907 SDValue FuncTLVGet = DAG.getLoad(
8908 PtrMemVT, DL, Chain, DescAddr,
8910 Align(PtrMemVT.getSizeInBits() / 8),
8912 Chain = FuncTLVGet.getValue(1);
8913
8914 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
8915 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
8916
8918 MFI.setAdjustsStack(true);
8919
8920 // TLS calls preserve all registers except those that absolutely must be
8921 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
8922 // silly).
8923 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8924 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
8925 if (Subtarget->hasCustomCallingConv())
8926 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
8927
8928 // Finally, we can make the call. This is just a degenerate version of a
8929 // normal AArch64 call node: x0 takes the address of the descriptor, and
8930 // returns the address of the variable in this thread.
8931 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
8932 Chain =
8933 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
8934 Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
8935 DAG.getRegisterMask(Mask), Chain.getValue(1));
8936 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
8937}
8938
8939/// Convert a thread-local variable reference into a sequence of instructions to
8940/// compute the variable's address for the local exec TLS model of ELF targets.
8941/// The sequence depends on the maximum TLS area size.
8942SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
8943 SDValue ThreadBase,
8944 const SDLoc &DL,
8945 SelectionDAG &DAG) const {
8946 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8947 SDValue TPOff, Addr;
8948
8949 switch (DAG.getTarget().Options.TLSSize) {
8950 default:
8951 llvm_unreachable("Unexpected TLS size");
8952
8953 case 12: {
8954 // mrs x0, TPIDR_EL0
8955 // add x0, x0, :tprel_lo12:a
8957 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
8958 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
8959 Var,
8960 DAG.getTargetConstant(0, DL, MVT::i32)),
8961 0);
8962 }
8963
8964 case 24: {
8965 // mrs x0, TPIDR_EL0
8966 // add x0, x0, :tprel_hi12:a
8967 // add x0, x0, :tprel_lo12_nc:a
8968 SDValue HiVar = DAG.getTargetGlobalAddress(
8969 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
8970 SDValue LoVar = DAG.getTargetGlobalAddress(
8971 GV, DL, PtrVT, 0,
8973 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
8974 HiVar,
8975 DAG.getTargetConstant(0, DL, MVT::i32)),
8976 0);
8977 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
8978 LoVar,
8979 DAG.getTargetConstant(0, DL, MVT::i32)),
8980 0);
8981 }
8982
8983 case 32: {
8984 // mrs x1, TPIDR_EL0
8985 // movz x0, #:tprel_g1:a
8986 // movk x0, #:tprel_g0_nc:a
8987 // add x0, x1, x0
8988 SDValue HiVar = DAG.getTargetGlobalAddress(
8989 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
8990 SDValue LoVar = DAG.getTargetGlobalAddress(
8991 GV, DL, PtrVT, 0,
8993 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
8994 DAG.getTargetConstant(16, DL, MVT::i32)),
8995 0);
8996 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
8997 DAG.getTargetConstant(0, DL, MVT::i32)),
8998 0);
8999 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
9000 }
9001
9002 case 48: {
9003 // mrs x1, TPIDR_EL0
9004 // movz x0, #:tprel_g2:a
9005 // movk x0, #:tprel_g1_nc:a
9006 // movk x0, #:tprel_g0_nc:a
9007 // add x0, x1, x0
9008 SDValue HiVar = DAG.getTargetGlobalAddress(
9009 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
9010 SDValue MiVar = DAG.getTargetGlobalAddress(
9011 GV, DL, PtrVT, 0,
9013 SDValue LoVar = DAG.getTargetGlobalAddress(
9014 GV, DL, PtrVT, 0,
9016 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
9017 DAG.getTargetConstant(32, DL, MVT::i32)),
9018 0);
9019 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
9020 DAG.getTargetConstant(16, DL, MVT::i32)),
9021 0);
9022 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
9023 DAG.getTargetConstant(0, DL, MVT::i32)),
9024 0);
9025 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
9026 }
9027 }
9028}
9029
9030/// When accessing thread-local variables under either the general-dynamic or
9031/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
9032/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
9033/// is a function pointer to carry out the resolution.
9034///
9035/// The sequence is:
9036/// adrp x0, :tlsdesc:var
9037/// ldr x1, [x0, #:tlsdesc_lo12:var]
9038/// add x0, x0, #:tlsdesc_lo12:var
9039/// .tlsdesccall var
9040/// blr x1
9041/// (TPIDR_EL0 offset now in x0)
9042///
9043/// The above sequence must be produced unscheduled, to enable the linker to
9044/// optimize/relax this sequence.
9045/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
9046/// above sequence, and expanded really late in the compilation flow, to ensure
9047/// the sequence is produced as per above.
9048SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
9049 const SDLoc &DL,
9050 SelectionDAG &DAG) const {
9051 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9052
9053 SDValue Chain = DAG.getEntryNode();
9054 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
9055
9056 Chain =
9057 DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
9058 SDValue Glue = Chain.getValue(1);
9059
9060 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
9061}
9062
9063SDValue
9064AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
9065 SelectionDAG &DAG) const {
9066 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
9067
9068 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9069 
9070 TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
9071
9072 if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
9073 if (Model == TLSModel::LocalDynamic)
9074 Model = TLSModel::GeneralDynamic;
9075 }
9076
9077 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
9078 Model != TLSModel::LocalExec)
9079 report_fatal_error("ELF TLS only supported in small memory model or "
9080 "in local exec TLS model");
9081 // Different choices can be made for the maximum size of the TLS area for a
9082 // module. For the small address model, the default TLS size is 16MiB and the
9083 // maximum TLS size is 4GiB.
9084 // FIXME: add tiny and large code model support for TLS access models other
9085 // than local exec. We currently generate the same code as small for tiny,
9086 // which may be larger than needed.
9087
9088 SDValue TPOff;
9089 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9090 SDLoc DL(Op);
9091 const GlobalValue *GV = GA->getGlobal();
9092
9093 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
9094
9095 if (Model == TLSModel::LocalExec) {
9096 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
9097 } else if (Model == TLSModel::InitialExec) {
9098 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
9099 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
9100 } else if (Model == TLSModel::LocalDynamic) {
9101 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
9102 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
9103 // the beginning of the module's TLS region, followed by a DTPREL offset
9104 // calculation.
9105
9106 // These accesses will need deduplicating if there's more than one.
9107 AArch64FunctionInfo *MFI =
9108 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
9109 MFI->incNumLocalDynamicTLSAccesses();
9110
9111 // The call needs a relocation too for linker relaxation. It doesn't make
9112 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
9113 // the address.
9114 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
9115 AArch64II::MO_TLS);
9116
9117 // Now we can calculate the offset from TPIDR_EL0 to this module's
9118 // thread-local area.
9119 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
9120
9121 // Now use :dtprel_whatever: operations to calculate this variable's offset
9122 // in its thread-storage area.
9123 SDValue HiVar = DAG.getTargetGlobalAddress(
9124 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
9125 SDValue LoVar = DAG.getTargetGlobalAddress(
9126 GV, DL, MVT::i64, 0,
9127 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9128
9129 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
9130 DAG.getTargetConstant(0, DL, MVT::i32)),
9131 0);
9132 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
9133 DAG.getTargetConstant(0, DL, MVT::i32)),
9134 0);
9135 } else if (Model == TLSModel::GeneralDynamic) {
9136 // The call needs a relocation too for linker relaxation. It doesn't make
9137 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
9138 // the address.
9139 SDValue SymAddr =
9140 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
9141
9142 // Finally we can make a call to calculate the offset from tpidr_el0.
9143 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
9144 } else
9145 llvm_unreachable("Unsupported ELF TLS access model");
9146
9147 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
9148}
9149
9150SDValue
9151AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
9152 SelectionDAG &DAG) const {
9153 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
9154
9155 SDValue Chain = DAG.getEntryNode();
9156 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9157 SDLoc DL(Op);
9158
9159 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
9160
9161 // Load the ThreadLocalStoragePointer from the TEB
9162 // A pointer to the TLS array is located at offset 0x58 from the TEB.
9163 SDValue TLSArray =
9164 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
9165 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
9166 Chain = TLSArray.getValue(1);
9167
9168 // Load the TLS index from the C runtime;
9169 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
9170 // This also does the same as LOADgot, but using a generic i32 load,
9171 // while LOADgot only loads i64.
9172 SDValue TLSIndexHi =
9173 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
9174 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
9175 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9176 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
9177 SDValue TLSIndex =
9178 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
9179 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
9180 Chain = TLSIndex.getValue(1);
9181
9182 // The pointer to the thread's TLS data area is stored in the TLS array at
9183 // index _tls_index, i.e. at an offset of TLSIndex * 8 into TLSArray.
9184 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
9185 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
9186 DAG.getConstant(3, DL, PtrVT));
9187 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
9188 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
9189 MachinePointerInfo());
9190 Chain = TLS.getValue(1);
9191
9192 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9193 const GlobalValue *GV = GA->getGlobal();
9194 SDValue TGAHi = DAG.getTargetGlobalAddress(
9195 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
9196 SDValue TGALo = DAG.getTargetGlobalAddress(
9197 GV, DL, PtrVT, 0,
9198 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9199
9200 // Add the offset from the start of the .tls section (section base).
9201 SDValue Addr =
9202 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
9203 DAG.getTargetConstant(0, DL, MVT::i32)),
9204 0);
9205 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
9206 return Addr;
9207}
9208
9209SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
9210 SelectionDAG &DAG) const {
9211 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9212 if (DAG.getTarget().useEmulatedTLS())
9213 return LowerToTLSEmulatedModel(GA, DAG);
9214
9215 if (Subtarget->isTargetDarwin())
9216 return LowerDarwinGlobalTLSAddress(Op, DAG);
9217 if (Subtarget->isTargetELF())
9218 return LowerELFGlobalTLSAddress(Op, DAG);
9219 if (Subtarget->isTargetWindows())
9220 return LowerWindowsGlobalTLSAddress(Op, DAG);
9221
9222 llvm_unreachable("Unexpected platform trying to use TLS");
9223}
9224
9225// Looks through \param Val to determine the bit that can be used to
9226// check the sign of the value. It returns the unextended value and
9227// the sign bit position.
9228std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
9229 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
9230 return {Val.getOperand(0),
9231 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
9232 1};
9233
9234 if (Val.getOpcode() == ISD::SIGN_EXTEND)
9235 return {Val.getOperand(0),
9236 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
9237
9238 return {Val, Val.getValueSizeInBits() - 1};
9239}
9240
9241SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
9242 SDValue Chain = Op.getOperand(0);
9243 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
9244 SDValue LHS = Op.getOperand(2);
9245 SDValue RHS = Op.getOperand(3);
9246 SDValue Dest = Op.getOperand(4);
9247 SDLoc dl(Op);
9248 MachineFunction &MF = DAG.getMachineFunction();
9249 
9250 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
9251 // will not be produced, as they are conditional branch instructions that do
9252 // not set flags.
9253 bool ProduceNonFlagSettingCondBr =
9254 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
9255
9256 // Handle f128 first, since lowering it will result in comparing the return
9257 // value of a libcall against zero, which is just what the rest of LowerBR_CC
9258 // is expecting to deal with.
9259 if (LHS.getValueType() == MVT::f128) {
9260 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
9261
9262 // If softenSetCCOperands returned a scalar, we need to compare the result
9263 // against zero to select between true and false values.
9264 if (!RHS.getNode()) {
9265 RHS = DAG.getConstant(0, dl, LHS.getValueType());
9266 CC = ISD::SETNE;
9267 }
9268 }
9269
9270 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
9271 // instruction.
9272 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
9273 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
9274 // Only lower legal XALUO ops.
9275 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
9276 return SDValue();
9277
9278 // The actual operation with overflow check.
9279 AArch64CC::CondCode OFCC;
9280 SDValue Value, Overflow;
9281 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
9282
9283 if (CC == ISD::SETNE)
9284 OFCC = getInvertedCondCode(OFCC);
9285 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
9286
9287 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
9288 Overflow);
9289 }
9290
9291 if (LHS.getValueType().isInteger()) {
9292 assert((LHS.getValueType() == RHS.getValueType()) &&
9293 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
9294
9295 // If the RHS of the comparison is zero, we can potentially fold this
9296 // to a specialized branch.
9297 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
9298 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
9299 if (CC == ISD::SETEQ) {
9300 // See if we can use a TBZ to fold in an AND as well.
9301 // TBZ has a smaller branch displacement than CBZ. If the offset is
9302 // out of bounds, a late MI-layer pass rewrites branches.
9303 // 403.gcc is an example that hits this case.
9304 if (LHS.getOpcode() == ISD::AND &&
9305 isa<ConstantSDNode>(LHS.getOperand(1)) &&
9306 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
9307 SDValue Test = LHS.getOperand(0);
9308 uint64_t Mask = LHS.getConstantOperandVal(1);
9309 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
9310 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
9311 Dest);
9312 }
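// For example, br_cc(seteq, (and X, 8), 0, Dest) is emitted here as
// (TBZ X, #3, Dest).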
9313
9314 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
9315 } else if (CC == ISD::SETNE) {
9316 // See if we can use a TBNZ to fold in an AND as well.
9317 // TBNZ has a smaller branch displacement than CBNZ. If the offset is
9318 // out of bounds, a late MI-layer pass rewrites branches.
9319 // 403.gcc is an example that hits this case.
9320 if (LHS.getOpcode() == ISD::AND &&
9321 isa<ConstantSDNode>(LHS.getOperand(1)) &&
9322 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
9323 SDValue Test = LHS.getOperand(0);
9324 uint64_t Mask = LHS.getConstantOperandVal(1);
9325 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
9326 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
9327 Dest);
9328 }
9329
9330 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
9331 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
9332 // Don't combine AND since emitComparison converts the AND to an ANDS
9333 // (a.k.a. TST) and the test in the test bit and branch instruction
9334 // becomes redundant. This would also increase register pressure.
9335 uint64_t SignBitPos;
9336 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
9337 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
9338 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
9339 }
9340 }
9341 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
9342 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
9343 // Don't combine AND since emitComparison converts the AND to an ANDS
9344 // (a.k.a. TST) and the test in the test bit and branch instruction
9345 // becomes redundant. This would also increase register pressure.
9346 uint64_t SignBitPos;
9347 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
9348 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
9349 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
9350 }
9351
9352 SDValue CCVal;
9353 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
9354 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
9355 Cmp);
9356 }
9357
9358 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
9359 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
9360
9361 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
9362 // clean. Some of them require two branches to implement.
9363 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9364 AArch64CC::CondCode CC1, CC2;
9365 changeFPCCToAArch64CC(CC, CC1, CC2);
9366 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9367 SDValue BR1 =
9368 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
9369 if (CC2 != AArch64CC::AL) {
9370 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9371 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
9372 Cmp);
9373 }
9374
9375 return BR1;
9376}
9377
9378SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
9379 SelectionDAG &DAG) const {
9380 if (!Subtarget->hasNEON())
9381 return SDValue();
9382
9383 EVT VT = Op.getValueType();
9384 EVT IntVT = VT.changeTypeToInteger();
9385 SDLoc DL(Op);
9386
9387 SDValue In1 = Op.getOperand(0);
9388 SDValue In2 = Op.getOperand(1);
9389 EVT SrcVT = In2.getValueType();
9390
9391 if (!SrcVT.bitsEq(VT))
9392 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
9393
9394 if (VT.isScalableVector())
9395 IntVT =
9396 getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
9397
9398 if (VT.isFixedLengthVector() &&
9399 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
9400 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
9401
9402 In1 = convertToScalableVector(DAG, ContainerVT, In1);
9403 In2 = convertToScalableVector(DAG, ContainerVT, In2);
9404
9405 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
9406 return convertFromScalableVector(DAG, VT, Res);
9407 }
9408
9409 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
9410 if (VT.isScalableVector())
9411 return getSVESafeBitCast(VT, Op, DAG);
9412
9413 return DAG.getBitcast(VT, Op);
9414 };
9415
9416 SDValue VecVal1, VecVal2;
9417 EVT VecVT;
9418 auto SetVecVal = [&](int Idx = -1) {
9419 if (!VT.isVector()) {
9420 VecVal1 =
9421 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
9422 VecVal2 =
9423 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
9424 } else {
9425 VecVal1 = BitCast(VecVT, In1, DAG);
9426 VecVal2 = BitCast(VecVT, In2, DAG);
9427 }
9428 };
9429 if (VT.isVector()) {
9430 VecVT = IntVT;
9431 SetVecVal();
9432 } else if (VT == MVT::f64) {
9433 VecVT = MVT::v2i64;
9434 SetVecVal(AArch64::dsub);
9435 } else if (VT == MVT::f32) {
9436 VecVT = MVT::v4i32;
9437 SetVecVal(AArch64::ssub);
9438 } else if (VT == MVT::f16 || VT == MVT::bf16) {
9439 VecVT = MVT::v8i16;
9440 SetVecVal(AArch64::hsub);
9441 } else {
9442 llvm_unreachable("Invalid type for copysign!");
9443 }
9444
9445 unsigned BitWidth = In1.getScalarValueSizeInBits();
9446 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
9447
9448 // We want to materialize a mask with every bit but the high bit set, but the
9449 // AdvSIMD immediate moves cannot materialize that in a single instruction for
9450 // 64-bit elements. Instead, materialize all bits set and then negate that.
9451 if (VT == MVT::f64 || VT == MVT::v2f64) {
9452 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
9453 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
9454 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
9455 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
9456 }
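// After the FNEG the constant is 0x7FFFFFFFFFFFFFFF in each 64-bit lane,
// i.e. every bit set except the sign bit, matching the mask used for the
// smaller element sizes above.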
9457
9458 SDValue BSP =
9459 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
9460 if (VT == MVT::f16 || VT == MVT::bf16)
9461 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
9462 if (VT == MVT::f32)
9463 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
9464 if (VT == MVT::f64)
9465 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
9466
9467 return BitCast(VT, BSP, DAG);
9468}
9469
9470SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
9471 SelectionDAG &DAG) const {
9472 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
9473 Attribute::NoImplicitFloat))
9474 return SDValue();
9475
9476 if (!Subtarget->hasNEON())
9477 return SDValue();
9478
9479 bool IsParity = Op.getOpcode() == ISD::PARITY;
9480 SDValue Val = Op.getOperand(0);
9481 SDLoc DL(Op);
9482 EVT VT = Op.getValueType();
9483
9484 // For i32, a general parity computation using EORs is more efficient than
9485 // routing the value through the floating-point/AdvSIMD registers.
9486 if (VT == MVT::i32 && IsParity)
9487 return SDValue();
9488
9489 // If there is no CNT instruction available, GPR popcount can
9490 // be more efficiently lowered to the following sequence that uses
9491 // AdvSIMD registers/instructions as long as the copies to/from
9492 // the AdvSIMD registers are cheap.
9493 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
9494 // CNT V0.8B, V0.8B // 8xbyte pop-counts
9495 // ADDV B0, V0.8B // sum 8xbyte pop-counts
9496 // UMOV X0, V0.B[0] // copy byte result back to integer reg
9497 if (VT == MVT::i32 || VT == MVT::i64) {
9498 if (VT == MVT::i32)
9499 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
9500 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
9501
9502 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
9503 SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
9504 UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
9505 DAG.getConstant(0, DL, MVT::i64));
9506
9507 if (IsParity)
9508 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
9509 DAG.getConstant(1, DL, MVT::i32));
9510
9511 if (VT == MVT::i64)
9512 UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
9513 return UaddLV;
9514 } else if (VT == MVT::i128) {
9515 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
9516
9517 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
9518 SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
9519 UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
9520 DAG.getConstant(0, DL, MVT::i64));
9521
9522 if (IsParity)
9523 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
9524 DAG.getConstant(1, DL, MVT::i32));
9525
9526 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
9527 }
9528
9529 assert(!IsParity && "ISD::PARITY of vector types not supported");
9530
9531 if (VT.isScalableVector() ||
9532 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
9533 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
9534
9535 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
9536 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
9537 "Unexpected type for custom ctpop lowering");
9538
9539 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
9540 Val = DAG.getBitcast(VT8Bit, Val);
9541 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
9542
9543 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
9544 unsigned EltSize = 8;
9545 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
9546 while (EltSize != VT.getScalarSizeInBits()) {
9547 EltSize *= 2;
9548 NumElts /= 2;
9549 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
9550 Val = DAG.getNode(
9551 ISD::INTRINSIC_WO_CHAIN, DL, WidenVT,
9552 DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
9553 }
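// For example, for VT == v4i32 this performs a v16i8 CTPOP, then
// uaddlp v8i16 <- v16i8, then uaddlp v4i32 <- v8i16.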
9554
9555 return Val;
9556}
9557
9558SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
9559 EVT VT = Op.getValueType();
9560 assert(VT.isScalableVector() ||
9561 useSVEForFixedLengthVectorVT(
9562 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
9563
9564 SDLoc DL(Op);
9565 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
9566 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
9567}
9568
9569SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
9570 SelectionDAG &DAG) const {
9571
9572 EVT VT = Op.getValueType();
9573 SDLoc DL(Op);
9574 unsigned Opcode = Op.getOpcode();
9575 ISD::CondCode CC;
9576 switch (Opcode) {
9577 default:
9578 llvm_unreachable("Wrong instruction");
9579 case ISD::SMAX:
9580 CC = ISD::SETGT;
9581 break;
9582 case ISD::SMIN:
9583 CC = ISD::SETLT;
9584 break;
9585 case ISD::UMAX:
9586 CC = ISD::SETUGT;
9587 break;
9588 case ISD::UMIN:
9589 CC = ISD::SETULT;
9590 break;
9591 }
9592
9593 if (VT.isScalableVector() ||
9594 useSVEForFixedLengthVectorVT(
9595 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
9596 switch (Opcode) {
9597 default:
9598 llvm_unreachable("Wrong instruction");
9599 case ISD::SMAX:
9600 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
9601 case ISD::SMIN:
9602 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
9603 case ISD::UMAX:
9604 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
9605 case ISD::UMIN:
9606 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
9607 }
9608 }
9609
9610 SDValue Op0 = Op.getOperand(0);
9611 SDValue Op1 = Op.getOperand(1);
9612 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
9613 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
9614}
9615
9616SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
9617 SelectionDAG &DAG) const {
9618 EVT VT = Op.getValueType();
9619
9620 if (VT.isScalableVector() ||
9621 useSVEForFixedLengthVectorVT(
9622 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
9623 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
9624
9625 SDLoc DL(Op);
9626 SDValue REVB;
9627 MVT VST;
9628
9629 switch (VT.getSimpleVT().SimpleTy) {
9630 default:
9631 llvm_unreachable("Invalid type for bitreverse!");
9632
9633 case MVT::v2i32: {
9634 VST = MVT::v8i8;
9635 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
9636
9637 break;
9638 }
9639
9640 case MVT::v4i32: {
9641 VST = MVT::v16i8;
9642 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
9643
9644 break;
9645 }
9646
9647 case MVT::v1i64: {
9648 VST = MVT::v8i8;
9649 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
9650
9651 break;
9652 }
9653
9654 case MVT::v2i64: {
9655 VST = MVT::v16i8;
9656 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
9657
9658 break;
9659 }
9660 }
9661
9662 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
9663 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
9664}
9665
9666 // Check whether N forms a continuous comparison (OR-of-XOR) chain.
9667static bool
9668isOrXorChain(SDValue N, unsigned &Num,
9669 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
9670 if (Num == MaxXors)
9671 return false;
9672
9673 // Skip the one-use zext
9674 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
9675 N = N->getOperand(0);
9676
9677 // The leaf node must be XOR
9678 if (N->getOpcode() == ISD::XOR) {
9679 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
9680 Num++;
9681 return true;
9682 }
9683
9684 // All the non-leaf nodes must be OR.
9685 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
9686 return false;
9687
9688 if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
9689 isOrXorChain(N->getOperand(1), Num, WorkList))
9690 return true;
9691 return false;
9692}
9693
9694 // Transform chains of ORs and XORs, which are usually produced by memcmp/bcmp expansion.
9695 static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
9696 SDValue LHS = N->getOperand(0);
9697 SDValue RHS = N->getOperand(1);
9698 SDLoc DL(N);
9699 EVT VT = N->getValueType(0);
9700 SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
9701
9702 // Only handle integer compares.
9703 if (N->getOpcode() != ISD::SETCC)
9704 return SDValue();
9705
9706 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
9707 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
9708 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
9709 unsigned NumXors = 0;
9710 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
9711 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
9712 isOrXorChain(LHS, NumXors, WorkList)) {
9713 SDValue XOR0, XOR1;
9714 std::tie(XOR0, XOR1) = WorkList[0];
9715 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
9716 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
9717 for (unsigned I = 1; I < WorkList.size(); I++) {
9718 std::tie(XOR0, XOR1) = WorkList[I];
9719 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
9720 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
9721 }
9722
9723 // Exit early by inverting the condition, which helps reduce indentation.
9724 return Cmp;
9725 }
9726
9727 return SDValue();
9728}
9729
9730SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
9731
9732 if (Op.getValueType().isVector())
9733 return LowerVSETCC(Op, DAG);
9734
9735 bool IsStrict = Op->isStrictFPOpcode();
9736 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
9737 unsigned OpNo = IsStrict ? 1 : 0;
9738 SDValue Chain;
9739 if (IsStrict)
9740 Chain = Op.getOperand(0);
9741 SDValue LHS = Op.getOperand(OpNo + 0);
9742 SDValue RHS = Op.getOperand(OpNo + 1);
9743 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
9744 SDLoc dl(Op);
9745
9746 // We chose ZeroOrOneBooleanContents, so use zero and one.
9747 EVT VT = Op.getValueType();
9748 SDValue TVal = DAG.getConstant(1, dl, VT);
9749 SDValue FVal = DAG.getConstant(0, dl, VT);
9750
9751 // Handle f128 first, since one possible outcome is a normal integer
9752 // comparison which gets picked up by the next if statement.
9753 if (LHS.getValueType() == MVT::f128) {
9754 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
9755 IsSignaling);
9756
9757 // If softenSetCCOperands returned a scalar, use it.
9758 if (!RHS.getNode()) {
9759 assert(LHS.getValueType() == Op.getValueType() &&
9760 "Unexpected setcc expansion!");
9761 return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
9762 }
9763 }
9764
9765 if (LHS.getValueType().isInteger()) {
9766 SDValue CCVal;
9767 SDValue Cmp = getAArch64Cmp(
9768 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
9769
9770 // Note that we inverted the condition above, so we reverse the order of
9771 // the true and false operands here. This will allow the setcc to be
9772 // matched to a single CSINC instruction.
9773 SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
9774 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
9775 }
9776
9777 // Now we know we're dealing with FP values.
9778 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
9779 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
9780
9781 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
9782 // and do the comparison.
9783 SDValue Cmp;
9784 if (IsStrict)
9785 Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
9786 else
9787 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9788
9789 AArch64CC::CondCode CC1, CC2;
9790 changeFPCCToAArch64CC(CC, CC1, CC2);
9791 SDValue Res;
9792 if (CC2 == AArch64CC::AL) {
9793 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
9794 CC2);
9795 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9796
9797 // Note that we inverted the condition above, so we reverse the order of
9798 // the true and false operands here. This will allow the setcc to be
9799 // matched to a single CSINC instruction.
9800 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
9801 } else {
9802 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
9803 // totally clean. Some of them require two CSELs to implement. As is in
9804 // this case, we emit the first CSEL and then emit a second using the output
9805 // of the first as the RHS. We're effectively OR'ing the two CC's together.
9806
9807 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
9808 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9809 SDValue CS1 =
9810 DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
9811
9812 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9813 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
9814 }
9815 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
9816}
9817
9818SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
9819 SelectionDAG &DAG) const {
9820
9821 SDValue LHS = Op.getOperand(0);
9822 SDValue RHS = Op.getOperand(1);
9823 EVT VT = LHS.getValueType();
9824 if (VT != MVT::i32 && VT != MVT::i64)
9825 return SDValue();
9826
9827 SDLoc DL(Op);
9828 SDValue Carry = Op.getOperand(2);
9829 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
9830 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
9831 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue),
9832 LHS, RHS, InvCarry);
9833
9834 EVT OpVT = Op.getValueType();
9835 SDValue TVal = DAG.getConstant(1, DL, OpVT);
9836 SDValue FVal = DAG.getConstant(0, DL, OpVT);
9837
9838 ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
9839 ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
9840 SDValue CCVal =
9841 DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32);
9842 // Inputs are swapped because the condition is inverted. This will allow
9843 // matching with a single CSINC instruction.
9844 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
9845 Cmp.getValue(1));
9846}
9847
9848SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
9849 SDValue RHS, SDValue TVal,
9850 SDValue FVal, const SDLoc &dl,
9851 SelectionDAG &DAG) const {
9852 // Handle f128 first, because it will result in a comparison of some RTLIB
9853 // call result against zero.
9854 if (LHS.getValueType() == MVT::f128) {
9855 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
9856
9857 // If softenSetCCOperands returned a scalar, we need to compare the result
9858 // against zero to select between true and false values.
9859 if (!RHS.getNode()) {
9860 RHS = DAG.getConstant(0, dl, LHS.getValueType());
9861 CC = ISD::SETNE;
9862 }
9863 }
9864
9865 // Also handle f16, for which we need to do a f32 comparison.
9866 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
9867 LHS.getValueType() == MVT::bf16) {
9868 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
9869 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
9870 }
9871
9872 // Next, handle integers.
9873 if (LHS.getValueType().isInteger()) {
9874 assert((LHS.getValueType() == RHS.getValueType()) &&
9875 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
9876
9877 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
9878 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
9879 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
9880 // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
9881 // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
9882 // supported types.
9883 if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
9884 CTVal->isOne() && CFVal->isAllOnes() &&
9885 LHS.getValueType() == TVal.getValueType()) {
9886 EVT VT = LHS.getValueType();
9887 SDValue Shift =
9888 DAG.getNode(ISD::SRA, dl, VT, LHS,
9889 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
9890 return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
9891 }
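// For example, for i32 this turns (select_cc setgt, x, -1, 1, -1) into
// (or (sra x, 31), 1), producing +1 for non-negative x and -1 otherwise.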
9892
9893 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
9894 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
9895 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
9896 // Both require fewer instructions than a compare and conditional select.
9897 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
9898 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
9899 LHS.getValueType() == RHS.getValueType()) {
9900 EVT VT = LHS.getValueType();
9901 SDValue Shift =
9902 DAG.getNode(ISD::SRA, dl, VT, LHS,
9903 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
9904
9905 if (CC == ISD::SETGT)
9906 Shift = DAG.getNOT(dl, Shift, VT);
9907
9908 return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
9909 }
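// For example, smax(x, 0) for i32 becomes (and x, (not (sra x, 31))), which
// selects to BIC, and smin(x, 0) becomes (and x, (sra x, 31)).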
9910
9911 unsigned Opcode = AArch64ISD::CSEL;
9912
9913 // If both the TVal and the FVal are constants, see if we can swap them in
9914 // order to form a CSINV or CSINC out of them.
9915 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
9916 std::swap(TVal, FVal);
9917 std::swap(CTVal, CFVal);
9918 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9919 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
9920 std::swap(TVal, FVal);
9921 std::swap(CTVal, CFVal);
9922 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9923 } else if (TVal.getOpcode() == ISD::XOR) {
9924 // If TVal is a NOT we want to swap TVal and FVal so that we can match
9925 // with a CSINV rather than a CSEL.
9926 if (isAllOnesConstant(TVal.getOperand(1))) {
9927 std::swap(TVal, FVal);
9928 std::swap(CTVal, CFVal);
9929 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9930 }
9931 } else if (TVal.getOpcode() == ISD::SUB) {
9932 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
9933 // that we can match with a CSNEG rather than a CSEL.
9934 if (isNullConstant(TVal.getOperand(0))) {
9935 std::swap(TVal, FVal);
9936 std::swap(CTVal, CFVal);
9937 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9938 }
9939 } else if (CTVal && CFVal) {
9940 const int64_t TrueVal = CTVal->getSExtValue();
9941 const int64_t FalseVal = CFVal->getSExtValue();
9942 bool Swap = false;
9943
9944 // If both TVal and FVal are constants, see if FVal is the
9945 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
9946 // instead of a CSEL in that case.
9947 if (TrueVal == ~FalseVal) {
9948 Opcode = AArch64ISD::CSINV;
9949 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
9950 TrueVal == -FalseVal) {
9951 Opcode = AArch64ISD::CSNEG;
9952 } else if (TVal.getValueType() == MVT::i32) {
9953 // If our operands are only 32-bit wide, make sure we use 32-bit
9954 // arithmetic for the check whether we can use CSINC. This ensures that
9955 // the addition in the check will wrap around properly in case there is
9956 // an overflow (which would not be the case if we do the check with
9957 // 64-bit arithmetic).
9958 const uint32_t TrueVal32 = CTVal->getZExtValue();
9959 const uint32_t FalseVal32 = CFVal->getZExtValue();
9960
9961 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
9962 Opcode = AArch64ISD::CSINC;
9963
9964 if (TrueVal32 > FalseVal32) {
9965 Swap = true;
9966 }
9967 }
9968 } else {
9969 // 64-bit check whether we can use CSINC.
9970 const uint64_t TrueVal64 = TrueVal;
9971 const uint64_t FalseVal64 = FalseVal;
9972
9973 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
9974 Opcode = AArch64ISD::CSINC;
9975
9976 if (TrueVal > FalseVal) {
9977 Swap = true;
9978 }
9979 }
9980 }
9981
9982 // Swap TVal and FVal if necessary.
9983 if (Swap) {
9984 std::swap(TVal, FVal);
9985 std::swap(CTVal, CFVal);
9986 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9987 }
9988
9989 if (Opcode != AArch64ISD::CSEL) {
9990 // Drop FVal since we can get its value by simply inverting/negating
9991 // TVal.
9992 FVal = TVal;
9993 }
9994 }
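// For example, the constant pair {5, -6} selects CSINV, {5, -5} selects
// CSNEG, and {6, 5} selects CSINC after swapping the operands (and inverting
// the condition) so the larger value is produced by the increment.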
9995
9996 // Avoid materializing a constant when possible by reusing a known value in
9997 // a register. However, don't perform this optimization if the known value
9998 // is one, zero or negative one in the case of a CSEL. We can always
9999 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
10000 // FVal, respectively.
10001 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
10002 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
10003 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
10004 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
10005 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
10006 // "a != C ? x : a" to avoid materializing C.
10007 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
10008 TVal = LHS;
10009 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
10010 FVal = LHS;
10011 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
10012 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
10013 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
10014 // avoid materializing C.
10015 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
10016 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
10017 Opcode = AArch64ISD::CSINV;
10018 TVal = LHS;
10019 FVal = DAG.getConstant(0, dl, FVal.getValueType());
10020 }
10021 }
10022
10023 SDValue CCVal;
10024 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
10025 EVT VT = TVal.getValueType();
10026 return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
10027 }
10028
10029 // Now we know we're dealing with FP values.
10030 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
10031 LHS.getValueType() == MVT::f64);
10032 assert(LHS.getValueType() == RHS.getValueType());
10033 EVT VT = TVal.getValueType();
10034 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
10035
10036 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
10037 // clean. Some of them require two CSELs to implement.
10038 AArch64CC::CondCode CC1, CC2;
10039 changeFPCCToAArch64CC(CC, CC1, CC2);
10040
10041 if (DAG.getTarget().Options.UnsafeFPMath) {
10042 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
10043 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
10044 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
10045 if (RHSVal && RHSVal->isZero()) {
10046 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
10047 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
10048
10049 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
10050 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
10051 TVal = LHS;
10052 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
10053 CFVal && CFVal->isZero() &&
10054 FVal.getValueType() == LHS.getValueType())
10055 FVal = LHS;
10056 }
10057 }
10058
10059 // Emit first, and possibly only, CSEL.
10060 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
10061 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
10062
10063 // If we need a second CSEL, emit it, using the output of the first as the
10064 // RHS. We're effectively OR'ing the two CC's together.
10065 if (CC2 != AArch64CC::AL) {
10066 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
10067 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
10068 }
10069
10070 // Otherwise, return the output of the first CSEL.
10071 return CS1;
10072}
10073
10074SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
10075 SelectionDAG &DAG) const {
10076 EVT Ty = Op.getValueType();
10077 auto Idx = Op.getConstantOperandAPInt(2);
10078 int64_t IdxVal = Idx.getSExtValue();
10079 assert(Ty.isScalableVector() &&
10080 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
10081
10082 // We can use the splice instruction for certain index values where we are
10083 // able to efficiently generate the correct predicate. The index will be
10084 // inverted and used directly as the input to the ptrue instruction, i.e.
10085 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
10086 // splice predicate. However, we can only do this if we can guarantee that
10087 // there are enough elements in the vector, hence we check the index <= min
10088 // number of elements.
10089 std::optional<unsigned> PredPattern;
10090 if (Ty.isScalableVector() && IdxVal < 0 &&
10091 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
10092 std::nullopt) {
10093 SDLoc DL(Op);
10094
10095 // Create a predicate where all but the last -IdxVal elements are false.
10096 EVT PredVT = Ty.changeVectorElementType(MVT::i1);
10097 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
10098 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
10099
10100 // Now splice the two inputs together using the predicate.
10101 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
10102 Op.getOperand(1));
10103 }
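// For example, with IdxVal == -2 this creates a PTRUE with the VL2 pattern,
// reverses it so only the last two lanes are active, and the SPLICE then
// concatenates the final two elements of operand 0 with the leading elements
// of operand 1.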
10104
10105 // This will select to an EXT instruction, which has a maximum immediate
10106 // value of 255, hence 2048-bits is the maximum value we can lower.
10107 if (IdxVal >= 0 &&
10108 IdxVal < int64_t(2048 / Ty.getVectorElementType().getSizeInBits()))
10109 return Op;
10110
10111 return SDValue();
10112}
10113
10114SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
10115 SelectionDAG &DAG) const {
10116 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
10117 SDValue LHS = Op.getOperand(0);
10118 SDValue RHS = Op.getOperand(1);
10119 SDValue TVal = Op.getOperand(2);
10120 SDValue FVal = Op.getOperand(3);
10121 SDLoc DL(Op);
10122 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
10123}
10124
10125SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
10126 SelectionDAG &DAG) const {
10127 SDValue CCVal = Op->getOperand(0);
10128 SDValue TVal = Op->getOperand(1);
10129 SDValue FVal = Op->getOperand(2);
10130 SDLoc DL(Op);
10131
10132 EVT Ty = Op.getValueType();
10133 if (Ty == MVT::aarch64svcount) {
10134 TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
10135 FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
10136 SDValue Sel =
10137 DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
10138 return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
10139 }
10140
10141 if (Ty.isScalableVector()) {
10142 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
10143 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
10144 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
10145 }
10146
10147 if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
10148 // FIXME: Ideally this would be the same as above using i1 types, however
10149 // for the moment we can't deal with fixed i1 vector types properly, so
10150 // instead extend the predicate to a result type sized integer vector.
10151 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
10152 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
10153 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
10154 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
10155 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
10156 }
10157
10158 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
10159 // instruction.
10160 if (ISD::isOverflowIntrOpRes(CCVal)) {
10161 // Only lower legal XALUO ops.
10162 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
10163 return SDValue();
10164
10165 AArch64CC::CondCode OFCC;
10166 SDValue Value, Overflow;
10167 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
10168 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
10169
10170 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
10171 CCVal, Overflow);
10172 }
10173
10174 // Lower it the same way as we would lower a SELECT_CC node.
10175 ISD::CondCode CC;
10176 SDValue LHS, RHS;
10177 if (CCVal.getOpcode() == ISD::SETCC) {
10178 LHS = CCVal.getOperand(0);
10179 RHS = CCVal.getOperand(1);
10180 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
10181 } else {
10182 LHS = CCVal;
10183 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
10184 CC = ISD::SETNE;
10185 }
10186
10187 // If we are lowering a f16 and we do not have fullf16, convert to a f32 in
10188 // order to use FCSELSrrr
10189 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
10190 TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
10191 DAG.getUNDEF(MVT::f32), TVal);
10192 FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
10193 DAG.getUNDEF(MVT::f32), FVal);
10194 }
10195
10196 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
10197
10198 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
10199 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
10200 }
10201
10202 return Res;
10203}
10204
10205SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
10206 SelectionDAG &DAG) const {
10207 // Jump table entries are emitted as PC-relative offsets. No additional
10208 // tweaking is necessary here. Just get the address of the jump table.
10209 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
10210
10211 CodeModel::Model CM = getTargetMachine().getCodeModel();
10212 if (CM == CodeModel::Large && !getTargetMachine().isPositionIndependent() &&
10213 !Subtarget->isTargetMachO())
10214 return getAddrLarge(JT, DAG);
10215 if (CM == CodeModel::Tiny)
10216 return getAddrTiny(JT, DAG);
10217 return getAddr(JT, DAG);
10218}
10219
10220SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
10221 SelectionDAG &DAG) const {
10222 // Jump table entries are emitted as PC-relative offsets. No additional
10223 // tweaking is necessary here. Just get the address of the jump table.
10224 SDLoc DL(Op);
10225 SDValue JT = Op.getOperand(1);
10226 SDValue Entry = Op.getOperand(2);
10227 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
10228
10229 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10230 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
10231
10232 SDNode *Dest =
10233 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
10234 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
10235 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
10236 return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
10237}
10238
10239SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
10240 SelectionDAG &DAG) const {
10241 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
10242 CodeModel::Model CM = getTargetMachine().getCodeModel();
10243 if (CM == CodeModel::Large) {
10244 // Use the GOT for the large code model on iOS.
10245 if (Subtarget->isTargetMachO()) {
10246 return getGOT(CP, DAG);
10247 }
10248 if (!getTargetMachine().isPositionIndependent())
10249 return getAddrLarge(CP, DAG);
10250 } else if (CM == CodeModel::Tiny) {
10251 return getAddrTiny(CP, DAG);
10252 }
10253 return getAddr(CP, DAG);
10254}
10255
10256SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
10257 SelectionDAG &DAG) const {
10258 BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
10259 CodeModel::Model CM = getTargetMachine().getCodeModel();
10260 if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
10261 if (!getTargetMachine().isPositionIndependent())
10262 return getAddrLarge(BA, DAG);
10263 } else if (CM == CodeModel::Tiny) {
10264 return getAddrTiny(BA, DAG);
10265 }
10266 return getAddr(BA, DAG);
10267}
10268
10269SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
10270 SelectionDAG &DAG) const {
10271 AArch64FunctionInfo *FuncInfo =
10272 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10273
10274 SDLoc DL(Op);
10275 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
10276 getPointerTy(DAG.getDataLayout()));
10277 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
10278 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10279 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
10280 MachinePointerInfo(SV));
10281}
10282
10283SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
10284 SelectionDAG &DAG) const {
10285 MachineFunction &MF = DAG.getMachineFunction();
10286 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10287
10288 SDLoc DL(Op);
10289 SDValue FR;
10290 if (Subtarget->isWindowsArm64EC()) {
10291 // With the Arm64EC ABI, we compute the address of the varargs save area
10292 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
10293 // but calls from an entry thunk can pass in a different address.
10294 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
10295 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
10296 uint64_t StackOffset;
10297 if (FuncInfo->getVarArgsGPRSize() > 0)
10298 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
10299 else
10300 StackOffset = FuncInfo->getVarArgsStackOffset();
10301 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
10302 DAG.getConstant(StackOffset, DL, MVT::i64));
10303 } else {
10304 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
10305 ? FuncInfo->getVarArgsGPRIndex()
10306 : FuncInfo->getVarArgsStackIndex(),
10307 getPointerTy(DAG.getDataLayout()));
10308 }
10309 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10310 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
10311 MachinePointerInfo(SV));
10312}
10313
10314SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
10315 SelectionDAG &DAG) const {
10316 // The layout of the va_list struct is specified in the AArch64 Procedure Call
10317 // Standard, section B.3.
10318 MachineFunction &MF = DAG.getMachineFunction();
10319 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10320 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
10321 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10322 auto PtrVT = getPointerTy(DAG.getDataLayout());
10323 SDLoc DL(Op);
10324
10325 SDValue Chain = Op.getOperand(0);
10326 SDValue VAList = Op.getOperand(1);
10327 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10328 SmallVector<SDValue, 4> MemOps;
10329
10330 // void *__stack at offset 0
10331 unsigned Offset = 0;
10332 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
10333 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
10334 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
10335 MachinePointerInfo(SV), Align(PtrSize)));
10336
10337 // void *__gr_top at offset 8 (4 on ILP32)
10338 Offset += PtrSize;
10339 int GPRSize = FuncInfo->getVarArgsGPRSize();
10340 if (GPRSize > 0) {
10341 SDValue GRTop, GRTopAddr;
10342
10343 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10344 DAG.getConstant(Offset, DL, PtrVT));
10345
10346 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
10347 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
10348 DAG.getConstant(GPRSize, DL, PtrVT));
10349 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
10350
10351 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
10352 MachinePointerInfo(SV, Offset),
10353 Align(PtrSize)));
10354 }
10355
10356 // void *__vr_top at offset 16 (8 on ILP32)
10357 Offset += PtrSize;
10358 int FPRSize = FuncInfo->getVarArgsFPRSize();
10359 if (FPRSize > 0) {
10360 SDValue VRTop, VRTopAddr;
10361 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10362 DAG.getConstant(Offset, DL, PtrVT));
10363
10364 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
10365 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
10366 DAG.getConstant(FPRSize, DL, PtrVT));
10367 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
10368
10369 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
10370 MachinePointerInfo(SV, Offset),
10371 Align(PtrSize)));
10372 }
10373
10374 // int __gr_offs at offset 24 (12 on ILP32)
10375 Offset += PtrSize;
10376 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10377 DAG.getConstant(Offset, DL, PtrVT));
10378 MemOps.push_back(
10379 DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32),
10380 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
10381
10382 // int __vr_offs at offset 28 (16 on ILP32)
10383 Offset += 4;
10384 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10385 DAG.getConstant(Offset, DL, PtrVT));
10386 MemOps.push_back(
10387 DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32),
10388 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
10389
10390 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
10391}
10392
10393SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
10394 SelectionDAG &DAG) const {
10395 MachineFunction &MF = DAG.getMachineFunction();
10396
10397 if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()))
10398 return LowerWin64_VASTART(Op, DAG);
10399 else if (Subtarget->isTargetDarwin())
10400 return LowerDarwin_VASTART(Op, DAG);
10401 else
10402 return LowerAAPCS_VASTART(Op, DAG);
10403}
10404
10405SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
10406 SelectionDAG &DAG) const {
10407 // AAPCS has three pointers and two ints (= 32 bytes), Darwin has a single
10408 // pointer.
10409 SDLoc DL(Op);
10410 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
10411 unsigned VaListSize =
10412 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
10413 ? PtrSize
10414 : Subtarget->isTargetILP32() ? 20 : 32;
10415 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
10416 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
10417
10418 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
10419 DAG.getConstant(VaListSize, DL, MVT::i32),
10420 Align(PtrSize), false, false, false,
10421 MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
10422}
10423
10424SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
10425 assert(Subtarget->isTargetDarwin() &&
10426 "automatic va_arg instruction only works on Darwin");
10427
10428 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10429 EVT VT = Op.getValueType();
10430 SDLoc DL(Op);
10431 SDValue Chain = Op.getOperand(0);
10432 SDValue Addr = Op.getOperand(1);
10433 MaybeAlign Align(Op.getConstantOperandVal(3));
10434 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
10435 auto PtrVT = getPointerTy(DAG.getDataLayout());
10436 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10437 SDValue VAList =
10438 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
10439 Chain = VAList.getValue(1);
10440 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
10441
10442 if (VT.isScalableVector())
10443 report_fatal_error("Passing SVE types to variadic functions is "
10444 "currently not supported");
10445
10446 if (Align && *Align > MinSlotSize) {
10447 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10448 DAG.getConstant(Align->value() - 1, DL, PtrVT));
10449 VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
10450 DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
10451 }
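// For example, a 16-byte aligned argument rounds VAList up here via
// (VAList + 15) & ~15 before the slot is read.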
10452
10453 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
10454 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
10455
10456 // Scalar integer and FP values smaller than 64 bits are implicitly extended
10457 // up to 64 bits. At the very least, we have to increase the striding of the
10458 // vaargs list to match this, and for FP values we need to introduce
10459 // FP_ROUND nodes as well.
10460 if (VT.isInteger() && !VT.isVector())
10461 ArgSize = std::max(ArgSize, MinSlotSize);
10462 bool NeedFPTrunc = false;
10463 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
10464 ArgSize = 8;
10465 NeedFPTrunc = true;
10466 }
10467
10468 // Increment the pointer, VAList, to the next vaarg
10469 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10470 DAG.getConstant(ArgSize, DL, PtrVT));
10471 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
10472
10473 // Store the incremented VAList to the legalized pointer
10474 SDValue APStore =
10475 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
10476
10477 // Load the actual argument out of the pointer VAList
10478 if (NeedFPTrunc) {
10479 // Load the value as an f64.
10480 SDValue WideFP =
10481 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
10482 // Round the value down to an f32.
10483 SDValue NarrowFP =
10484 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
10485 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
10486 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
10487 // Merge the rounded value with the chain output of the load.
10488 return DAG.getMergeValues(Ops, DL);
10489 }
10490
10491 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
10492}
10493
10494SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
10495 SelectionDAG &DAG) const {
10496 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10497 MFI.setFrameAddressIsTaken(true);
10498
10499 EVT VT = Op.getValueType();
10500 SDLoc DL(Op);
10501 unsigned Depth = Op.getConstantOperandVal(0);
10502 SDValue FrameAddr =
10503 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
10504 while (Depth--)
10505 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
10506 MachinePointerInfo());
10507
10508 if (Subtarget->isTargetILP32())
10509 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
10510 DAG.getValueType(VT));
10511
10512 return FrameAddr;
10513}
10514
10515SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
10516 SelectionDAG &DAG) const {
10517 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10518
10519 EVT VT = getPointerTy(DAG.getDataLayout());
10520 SDLoc DL(Op);
10521 int FI = MFI.CreateFixedObject(4, 0, false);
10522 return DAG.getFrameIndex(FI, VT);
10523}
10524
10525#define GET_REGISTER_MATCHER
10526#include "AArch64GenAsmMatcher.inc"
10527
10528// FIXME? Maybe this could be a TableGen attribute on some registers and
10529// this table could be generated automatically from RegInfo.
10530Register AArch64TargetLowering::
10531 getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
10532 Register Reg = MatchRegisterName(RegName);
10533 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
10534 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
10535 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
10536 if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
10537 !MRI->isReservedReg(MF, Reg))
10538 Reg = 0;
10539 }
10540 if (Reg)
10541 return Reg;
10542 report_fatal_error(Twine("Invalid register name \""
10543 + StringRef(RegName) + "\"."));
10544}
10545
10546SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
10547 SelectionDAG &DAG) const {
10548 DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
10549
10550 EVT VT = Op.getValueType();
10551 SDLoc DL(Op);
10552
10553 SDValue FrameAddr =
10554 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
10555 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
10556
10557 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
10558}
10559
10560SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
10561 SelectionDAG &DAG) const {
10562 MachineFunction &MF = DAG.getMachineFunction();
10563 MachineFrameInfo &MFI = MF.getFrameInfo();
10564 MFI.setReturnAddressIsTaken(true);
10565
10566 EVT VT = Op.getValueType();
10567 SDLoc DL(Op);
10568 unsigned Depth = Op.getConstantOperandVal(0);
10569 SDValue ReturnAddress;
10570 if (Depth) {
10571 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
10572 SDValue Offset = DAG.getConstant(8, DL, VT);
10573 ReturnAddress = DAG.getLoad(
10574 VT, DL, DAG.getEntryNode(),
10575 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
10576 } else {
10577 // Return LR, which contains the return address. Mark it an implicit
10578 // live-in.
10579 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
10580 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
10581 }
10582
10583 // The XPACLRI instruction assembles to a hint-space instruction before
10584 // Armv8.3-A, so it can safely be used on any pre-Armv8.3-A architecture.
10585 // On Armv8.3-A and onwards, XPACI is available, so use that
10586 // instead.
10587 SDNode *St;
10588 if (Subtarget->hasPAuth()) {
10589 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
10590 } else {
10591 // XPACLRI operates on LR therefore we must move the operand accordingly.
10592 SDValue Chain =
10593 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
10594 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
10595 }
10596 return SDValue(St, 0);
10597}
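// Usage sketch: the two lowerings above are what back the GCC/Clang builtins
// below on AArch64. Depth 0 returns LR with any pointer-authentication
// signature stripped (XPACI/XPACLRI); deeper queries walk the frame-record
// chain, with the saved LR sitting 8 bytes past the saved FP.
static void *CallerPC() { return __builtin_return_address(0); }
static void *CallerFrame() { return __builtin_frame_address(1); }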
10598
10599/// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
10600/// i32 values and take a 2 x i32 value to shift plus a shift amount.
10601SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
10602 SelectionDAG &DAG) const {
10603 SDValue Lo, Hi;
10604 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
10605 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
10606}
10607
10608bool AArch64TargetLowering::isOffsetFoldingLegal(
10609 const GlobalAddressSDNode *GA) const {
10610 // Offsets are folded in the DAG combine rather than here so that we can
10611 // intelligently choose an offset based on the uses.
10612 return false;
10613}
10614
10615bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
10616 bool OptForSize) const {
10617 bool IsLegal = false;
10618 // We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and
10619 // 16-bit case when target has full fp16 support.
10620 // We encode bf16 bit patterns as if they were fp16. This results in very
10621 // strange looking assembly but should populate the register with appropriate
10622 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
10623 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
10624 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
10625 // FIXME: We should be able to handle f128 as well with a clever lowering.
10626 const APInt ImmInt = Imm.bitcastToAPInt();
10627 if (VT == MVT::f64)
10628 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
10629 else if (VT == MVT::f32)
10630 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
10631 else if (VT == MVT::f16 || VT == MVT::bf16)
10632 IsLegal =
10633 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
10634 Imm.isPosZero();
10635
10636 // If we cannot materialize the immediate in fmov's immediate field, check
10637 // if the value can be encoded as the immediate operand of a logical instruction.
10638 // The immediate value will be created with either MOVZ, MOVN, or ORR.
10639 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
10640 // generate that fmov.
10641 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
10642 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
10643 // however the mov+fmov sequence is always better because of the reduced
10644 // cache pressure. The timings are still the same if you consider
10645 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
10646 // movw+movk is fused). So we limit to at most 2 instructions.
10647 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
10648 AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn);
10649 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
10650 IsLegal = Insn.size() <= Limit;
10651 }
10652
10653 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
10654 << " imm value: "; Imm.dump(););
10655 return IsLegal;
10656}
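// Worked check of the bf16/fp16 trick described in isFPImmLegal: BF16 1.5 and
// FP16 1.9375 share the bit pattern 0x3FC0, so emitting the FP16 FMOV
// immediate (imm8 0x7f) materializes exactly the bits the bf16 value needs.
// Standalone sketch, independent of the LLVM helpers used above.
//   bf16 1.5    = 0 01111111 1000000   -> 0x3FC0
//   fp16 1.9375 = 0 01111 1111000000   -> 0x3FC0
#include <cstdint>

static constexpr uint16_t BF16OnePointFive = 0x3FC0;
static constexpr uint16_t FP16OnePoint9375 =
    (0u << 15) | (15u << 10) | 0x3C0u; // sign 0, exponent 15 (2^0), mantissa 1.9375
static_assert(BF16OnePointFive == FP16OnePoint9375,
              "bf16 1.5 and fp16 1.9375 share one bit pattern");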
10657
10658//===----------------------------------------------------------------------===//
10659// AArch64 Optimization Hooks
10660//===----------------------------------------------------------------------===//
10661
10662static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
10663 SDValue Operand, SelectionDAG &DAG,
10664 int &ExtraSteps) {
10665 EVT VT = Operand.getValueType();
10666 if ((ST->hasNEON() &&
10667 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
10668 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
10669 VT == MVT::v4f32)) ||
10670 (ST->hasSVE() &&
10671 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
10672 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) {
10673 // For the reciprocal estimates, convergence is quadratic, so the number
10674 // of digits is doubled after each iteration. In ARMv8, the accuracy of
10675 // the initial estimate is 2^-8. Thus the number of extra steps to refine
10676 // the result for float (23 mantissa bits) is 2 and for double (52
10677 // mantissa bits) is 3.
10678 constexpr unsigned AccurateBits = 8;
10679 unsigned DesiredBits =
10680 APFloat::semanticsPrecision(DAG.EVTToAPFloatSemantics(VT));
10681 ExtraSteps = DesiredBits <= AccurateBits
10682 ? 0
10683 : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
10684 }
10685
10686 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
10687 }
10688
10689 return SDValue();
10690}
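// Worked instance of the step count computed in getEstimate: with an initial
// estimate accurate to 2^-8 and quadratic convergence, float (24-bit
// precision) needs ceil(log2(24)) - ceil(log2(8)) = 5 - 3 = 2 refinement
// steps and double (53-bit precision) needs 6 - 3 = 3. Standalone sketch,
// assuming nothing about the LLVM helpers above.
#include <cmath>

static unsigned RefinementSteps(unsigned DesiredBits, unsigned AccurateBits = 8) {
  if (DesiredBits <= AccurateBits)
    return 0;
  return static_cast<unsigned>(std::ceil(std::log2(double(DesiredBits))) -
                               std::ceil(std::log2(double(AccurateBits))));
}
// RefinementSteps(24) == 2 for float, RefinementSteps(53) == 3 for double.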
10691
10692SDValue
10693AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
10694 const DenormalMode &Mode) const {
10695 SDLoc DL(Op);
10696 EVT VT = Op.getValueType();
10697 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
10698 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
10699 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
10700}
10701
10702SDValue
10703AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
10704 SelectionDAG &DAG) const {
10705 return Op;
10706}
10707
10708SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
10709 SelectionDAG &DAG, int Enabled,
10710 int &ExtraSteps,
10711 bool &UseOneConst,
10712 bool Reciprocal) const {
10713 if (Enabled == ReciprocalEstimate::Enabled ||
10714 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
10715 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
10716 DAG, ExtraSteps)) {
10717 SDLoc DL(Operand);
10718 EVT VT = Operand.getValueType();
10719
10720 SDNodeFlags Flags;
10721 Flags.setAllowReassociation(true);
10722
10723 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
10724 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
10725 for (int i = ExtraSteps; i > 0; --i) {
10726 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
10727 Flags);
10728 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
10729 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
10730 }
10731 if (!Reciprocal)
10732 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
10733
10734 ExtraSteps = 0;
10735 return Estimate;
10736 }
10737
10738 return SDValue();
10739}
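// Scalar sketch of the Newton-Raphson refinement getSqrtEstimate emits: each
// step computes E <- E * 0.5 * (3 - X * E * E), which is what the
// FMUL + FRSQRTS + FMUL triple per iteration implements. Illustrative only.
static double RefineRSqrt(double X, double Estimate, int Steps) {
  for (int I = 0; I < Steps; ++I)
    Estimate = Estimate * (0.5 * (3.0 - X * Estimate * Estimate));
  return Estimate; // multiply by X afterwards to recover sqrt(X) itself
}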
10740
10741SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
10742 SelectionDAG &DAG, int Enabled,
10743 int &ExtraSteps) const {
10744 if (Enabled == ReciprocalEstimate::Enabled)
10745 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
10746 DAG, ExtraSteps)) {
10747 SDLoc DL(Operand);
10748 EVT VT = Operand.getValueType();
10749
10750 SDNodeFlags Flags;
10751 Flags.setAllowReassociation(true);
10752
10753 // Newton reciprocal iteration: E * (2 - X * E)
10754 // AArch64 reciprocal iteration instruction: (2 - M * N)
10755 for (int i = ExtraSteps; i > 0; --i) {
10756 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
10757 Estimate, Flags);
10758 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
10759 }
10760
10761 ExtraSteps = 0;
10762 return Estimate;
10763 }
10764
10765 return SDValue();
10766}
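// Scalar sketch of the reciprocal refinement getRecipEstimate emits: each
// step computes E <- E * (2 - X * E), with FRECPS providing the (2 - X * E)
// term. Illustrative only.
static double RefineRecip(double X, double Estimate, int Steps) {
  for (int I = 0; I < Steps; ++I)
    Estimate = Estimate * (2.0 - X * Estimate);
  return Estimate;
}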
10767
10768//===----------------------------------------------------------------------===//
10769// AArch64 Inline Assembly Support
10770//===----------------------------------------------------------------------===//
10771
10772// Table of Constraints
10773// TODO: This is the current set of constraints supported by ARM for the
10774// compiler, not all of them may make sense.
10775//
10776// r - A general register
10777// w - An FP/SIMD register of some size in the range v0-v31
10778// x - An FP/SIMD register of some size in the range v0-v15
10779// I - Constant that can be used with an ADD instruction
10780// J - Constant that can be used with a SUB instruction
10781// K - Constant that can be used with a 32-bit logical instruction
10782// L - Constant that can be used with a 64-bit logical instruction
10783// M - Constant that can be used as a 32-bit MOV immediate
10784// N - Constant that can be used as a 64-bit MOV immediate
10785// Q - A memory reference with base register and no offset
10786// S - A symbolic address
10787// Y - Floating point constant zero
10788// Z - Integer constant zero
10789//
10790// Note that general register operands will be output using their 64-bit x
10791// register name, whatever the size of the variable, unless the asm operand
10792// is prefixed by the %w modifier. Floating-point and SIMD register operands
10793// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
10794// %q modifier.
10795const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
10796 // At this point, we have to lower this constraint to something else, so we
10797 // lower it to an "r" or "w". However, by doing this we will force the result
10798 // to be in register, while the X constraint is much more permissive.
10799 //
10800 // Although we are correct (we are free to emit anything, without
10801 // constraints), we might break use cases that would expect us to be more
10802 // efficient and emit something else.
10803 if (!Subtarget->hasFPARMv8())
10804 return "r";
10805
10806 if (ConstraintVT.isFloatingPoint())
10807 return "w";
10808
10809 if (ConstraintVT.isVector() &&
10810 (ConstraintVT.getSizeInBits() == 64 ||
10811 ConstraintVT.getSizeInBits() == 128))
10812 return "w";
10813
10814 return "r";
10815}
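// Usage sketch for the constraint table above (user-level inline asm; the
// concrete registers are chosen by the allocator): "r" for a general
// register, "w" for an FP/SIMD register, "I" for an ADD immediate, and the
// %w / %d modifiers to print the 32-bit or scalar-double register names.
static int AddImmediate(int X) {
  int R;
  asm("add %w0, %w1, %2" : "=r"(R) : "r"(X), "I"(42));
  return R;
}

static double DoubleIt(double D) {
  double R;
  asm("fadd %d0, %d1, %d1" : "=w"(R) : "w"(D));
  return R;
}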
10816
10817enum class PredicateConstraint { Uph, Upl, Upa };
10818
10819static std::optional<PredicateConstraint>
10820parsePredicateConstraint(StringRef Constraint) {
10821 return StringSwitch<std::optional<PredicateConstraint>>(Constraint)
10822 .Case("Uph", PredicateConstraint::Uph)
10823 .Case("Upl", PredicateConstraint::Upl)
10824 .Case("Upa", PredicateConstraint::Upa)
10825 .Default(std::nullopt);
10826}
10827
10828static const TargetRegisterClass *
10829getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) {
10830 if (VT != MVT::aarch64svcount &&
10831 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
10832 return nullptr;
10833
10834 switch (Constraint) {
10835 case PredicateConstraint::Uph:
10836 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
10837 : &AArch64::PPR_p8to15RegClass;
10838 case PredicateConstraint::Upl:
10839 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
10840 : &AArch64::PPR_3bRegClass;
10841 case PredicateConstraint::Upa:
10842 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
10843 : &AArch64::PPRRegClass;
10844 }
10845
10846 llvm_unreachable("Missing PredicateConstraint!");
10847}
10848
10849enum class ReducedGprConstraint { Uci, Ucj };
10850
10851static std::optional<ReducedGprConstraint>
10852parseReducedGprConstraint(StringRef Constraint) {
10853 return StringSwitch<std::optional<ReducedGprConstraint>>(Constraint)
10854 .Case("Uci", ReducedGprConstraint::Uci)
10855 .Case("Ucj", ReducedGprConstraint::Ucj)
10856 .Default(std::nullopt);
10857}
10858
10859static const TargetRegisterClass *
10860getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT) {
10861 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
10862 return nullptr;
10863
10864 switch (Constraint) {
10865 case ReducedGprConstraint::Uci:
10866 return &AArch64::MatrixIndexGPR32_8_11RegClass;
10867 case ReducedGprConstraint::Ucj:
10868 return &AArch64::MatrixIndexGPR32_12_15RegClass;
10869 }
10870
10871 llvm_unreachable("Missing ReducedGprConstraint!");
10872}
10873
10874// The set of cc code supported is from
10875// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
10876static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint) {
10877 AArch64CC::CondCode Cond = StringSwitch<AArch64CC::CondCode>(Constraint)
10878 .Case("{@cchi}", AArch64CC::HI)
10879 .Case("{@cccs}", AArch64CC::HS)
10880 .Case("{@cclo}", AArch64CC::LO)
10881 .Case("{@ccls}", AArch64CC::LS)
10882 .Case("{@cccc}", AArch64CC::LO)
10883 .Case("{@cceq}", AArch64CC::EQ)
10884 .Case("{@ccgt}", AArch64CC::GT)
10885 .Case("{@ccge}", AArch64CC::GE)
10886 .Case("{@cclt}", AArch64CC::LT)
10887 .Case("{@ccle}", AArch64CC::LE)
10888 .Case("{@cchs}", AArch64CC::HS)
10889 .Case("{@ccne}", AArch64CC::NE)
10890 .Case("{@ccvc}", AArch64CC::VC)
10891 .Case("{@ccpl}", AArch64CC::PL)
10892 .Case("{@ccvs}", AArch64CC::VS)
10893 .Case("{@ccmi}", AArch64CC::MI)
10894 .Default(AArch64CC::Invalid);
10895 return Cond;
10896}
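// Usage sketch for the "{@cc<cond>}" flag-output constraints parsed above
// (supported by recent Clang/GCC): the asm output below yields a 0/1 value
// derived from NZCV, which LowerAsmOutputForConstraint further down
// materializes with a CSINC (CSET).
static bool IsLowerOrSame(unsigned A, unsigned B) {
  bool R;
  asm("cmp %w1, %w2" : "=@ccls"(R) : "r"(A), "r"(B));
  return R;
}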
10897
10898/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
10899/// WZR, invert(<cond>)'.
10900static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL,
10901 SelectionDAG &DAG) {
10902 return DAG.getNode(
10903 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
10904 DAG.getConstant(0, DL, MVT::i32),
10905 DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32), NZCV);
10906}
10907
10908// Lower @cc flag output via getSETCC.
10909SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
10910 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
10911 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
10912 AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
10913 if (Cond == AArch64CC::Invalid)
10914 return SDValue();
10915 // The output variable should be a scalar integer.
10916 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
10917 OpInfo.ConstraintVT.getSizeInBits() < 8)
10918 report_fatal_error("Flag output operand is of invalid type");
10919
10920 // Get NZCV register. Only update chain when copyfrom is glued.
10921 if (Glue.getNode()) {
10922 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32, Glue);
10923 Chain = Glue.getValue(1);
10924 } else
10925 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32);
10926 // Extract CC code.
10927 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
10928
10929 SDValue Result;
10930
10931 // Truncate or ZERO_EXTEND based on value types.
10932 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
10933 Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
10934 else
10935 Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
10936
10937 return Result;
10938}
10939
10940/// getConstraintType - Given a constraint letter, return the type of
10941/// constraint it is for this target.
10942AArch64TargetLowering::ConstraintType
10943AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
10944 if (Constraint.size() == 1) {
10945 switch (Constraint[0]) {
10946 default:
10947 break;
10948 case 'x':
10949 case 'w':
10950 case 'y':
10951 return C_RegisterClass;
10952 // An address with a single base register. Due to the way we
10953 // currently handle addresses it is the same as 'r'.
10954 case 'Q':
10955 return C_Memory;
10956 case 'I':
10957 case 'J':
10958 case 'K':
10959 case 'L':
10960 case 'M':
10961 case 'N':
10962 case 'Y':
10963 case 'Z':
10964 return C_Immediate;
10965 case 'z':
10966 case 'S': // A symbol or label reference with a constant offset
10967 return C_Other;
10968 }
10969 } else if (parsePredicateConstraint(Constraint))
10970 return C_RegisterClass;
10971 else if (parseReducedGprConstraint(Constraint))
10972 return C_RegisterClass;
10973 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
10974 return C_Other;
10975 return TargetLowering::getConstraintType(Constraint);
10976}
10977
10978/// Examine constraint type and operand type and determine a weight value.
10979/// This object must already have been set up with the operand type
10980/// and the current alternative constraint selected.
10981TargetLowering::ConstraintWeight
10982AArch64TargetLowering::getSingleConstraintMatchWeight(
10983 AsmOperandInfo &info, const char *constraint) const {
10984 ConstraintWeight weight = CW_Invalid;
10985 Value *CallOperandVal = info.CallOperandVal;
10986 // If we don't have a value, we can't do a match,
10987 // but allow it at the lowest weight.
10988 if (!CallOperandVal)
10989 return CW_Default;
10990 Type *type = CallOperandVal->getType();
10991 // Look at the constraint type.
10992 switch (*constraint) {
10993 default:
10994 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
10995 break;
10996 case 'x':
10997 case 'w':
10998 case 'y':
10999 if (type->isFloatingPointTy() || type->isVectorTy())
11000 weight = CW_Register;
11001 break;
11002 case 'z':
11003 weight = CW_Constant;
11004 break;
11005 case 'U':
11006 if (parsePredicateConstraint(constraint) ||
11007 parseReducedGprConstraint(constraint))
11008 weight = CW_Register;
11009 break;
11010 }
11011 return weight;
11012}
11013
11014std::pair<unsigned, const TargetRegisterClass *>
11015AArch64TargetLowering::getRegForInlineAsmConstraint(
11016 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
11017 if (Constraint.size() == 1) {
11018 switch (Constraint[0]) {
11019 case 'r':
11020 if (VT.isScalableVector())
11021 return std::make_pair(0U, nullptr);
11022 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
11023 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
11024 if (VT.getFixedSizeInBits() == 64)
11025 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
11026 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
11027 case 'w': {
11028 if (!Subtarget->hasFPARMv8())
11029 break;
11030 if (VT.isScalableVector()) {
11031 if (VT.getVectorElementType() != MVT::i1)
11032 return std::make_pair(0U, &AArch64::ZPRRegClass);
11033 return std::make_pair(0U, nullptr);
11034 }
11035 uint64_t VTSize = VT.getFixedSizeInBits();
11036 if (VTSize == 16)
11037 return std::make_pair(0U, &AArch64::FPR16RegClass);
11038 if (VTSize == 32)
11039 return std::make_pair(0U, &AArch64::FPR32RegClass);
11040 if (VTSize == 64)
11041 return std::make_pair(0U, &AArch64::FPR64RegClass);
11042 if (VTSize == 128)
11043 return std::make_pair(0U, &AArch64::FPR128RegClass);
11044 break;
11045 }
11046 // The instructions that this constraint is designed for can
11047 // only take 128-bit registers so just use that regclass.
11048 case 'x':
11049 if (!Subtarget->hasFPARMv8())
11050 break;
11051 if (VT.isScalableVector())
11052 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
11053 if (VT.getSizeInBits() == 128)
11054 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
11055 break;
11056 case 'y':
11057 if (!Subtarget->hasFPARMv8())
11058 break;
11059 if (VT.isScalableVector())
11060 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
11061 break;
11062 }
11063 } else {
11064 if (const auto PC = parsePredicateConstraint(Constraint))
11065 if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
11066 return std::make_pair(0U, RegClass);
11067
11068 if (const auto RGC = parseReducedGprConstraint(Constraint))
11069 if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
11070 return std::make_pair(0U, RegClass);
11071 }
11072 if (StringRef("{cc}").equals_insensitive(Constraint) ||
11073 parseConstraintCode(Constraint) != AArch64CC::Invalid)
11074 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
11075
11076 if (Constraint == "{za}") {
11077 return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
11078 }
11079
11080 if (Constraint == "{zt0}") {
11081 return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
11082 }
11083
11084 // Use the default implementation in TargetLowering to convert the register
11085 // constraint into a member of a register class.
11086 std::pair<unsigned, const TargetRegisterClass *> Res;
11087 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
11088
11089 // Not found as a standard register?
11090 if (!Res.second) {
11091 unsigned Size = Constraint.size();
11092 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
11093 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
11094 int RegNo;
11095 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
11096 if (!Failed && RegNo >= 0 && RegNo <= 31) {
11097 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
11098 // By default we'll emit v0-v31 for this unless there's a modifier where
11099 // we'll emit the correct register as well.
11100 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
11101 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
11102 Res.second = &AArch64::FPR64RegClass;
11103 } else {
11104 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
11105 Res.second = &AArch64::FPR128RegClass;
11106 }
11107 }
11108 }
11109 }
11110
11111 if (Res.second && !Subtarget->hasFPARMv8() &&
11112 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
11113 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
11114 return std::make_pair(0U, nullptr);
11115
11116 return Res;
11117}
11118
11119EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
11120 llvm::Type *Ty,
11121 bool AllowUnknown) const {
11122 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
11123 return EVT(MVT::i64x8);
11124
11125 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
11126}
11127
11128/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
11129/// vector. If it is invalid, don't add anything to Ops.
11130void AArch64TargetLowering::LowerAsmOperandForConstraint(
11131 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
11132 SelectionDAG &DAG) const {
11133 SDValue Result;
11134
11135 // Currently only support length 1 constraints.
11136 if (Constraint.size() != 1)
11137 return;
11138
11139 char ConstraintLetter = Constraint[0];
11140 switch (ConstraintLetter) {
11141 default:
11142 break;
11143
11144 // This set of constraints deal with valid constants for various instructions.
11145 // Validate and return a target constant for them if we can.
11146 case 'z': {
11147 // 'z' maps to xzr or wzr so it needs an input of 0.
11148 if (!isNullConstant(Op))
11149 return;
11150
11151 if (Op.getValueType() == MVT::i64)
11152 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
11153 else
11154 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
11155 break;
11156 }
11157 case 'S':
11158 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
11159 // supported for PIC while "s" isn't, making "s" less useful. We implement
11160 // "S" but not "s".
11161 TargetLowering::LowerAsmOperandForConstraint(Op, "s", Ops, DAG);
11162 break;
11163
11164 case 'I':
11165 case 'J':
11166 case 'K':
11167 case 'L':
11168 case 'M':
11169 case 'N':
11170 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
11171 if (!C)
11172 return;
11173
11174 // Grab the value and do some validation.
11175 uint64_t CVal = C->getZExtValue();
11176 switch (ConstraintLetter) {
11177 // The I constraint applies only to simple ADD or SUB immediate operands:
11178 // i.e. 0 to 4095 with optional shift by 12
11179 // The J constraint applies only to ADD or SUB immediates that would be
11180 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
11181 // instruction [or vice versa], in other words -1 to -4095 with optional
11182 // left shift by 12.
11183 case 'I':
11184 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
11185 break;
11186 return;
11187 case 'J': {
11188 uint64_t NVal = -C->getSExtValue();
11189 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
11190 CVal = C->getSExtValue();
11191 break;
11192 }
11193 return;
11194 }
11195 // The K and L constraints apply *only* to logical immediates, including
11196 // what used to be the MOVI alias for ORR (though the MOVI alias has now
11197 // been removed and MOV should be used). So these constraints have to
11198 // distinguish between bit patterns that are valid 32-bit or 64-bit
11199 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
11200 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
11201 // versa.
11202 case 'K':
11203 if (AArch64_AM::isLogicalImmediate(CVal, 32))
11204 break;
11205 return;
11206 case 'L':
11207 if (AArch64_AM::isLogicalImmediate(CVal, 64))
11208 break;
11209 return;
11210 // The M and N constraints are a superset of K and L respectively, for use
11211 // with the MOV (immediate) alias. As well as the logical immediates they
11212 // also match 32 or 64-bit immediates that can be loaded either using a
11213 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
11214 // (M) or 64-bit 0x1234000000000000 (N) etc.
11215 // As a note some of this code is liberally stolen from the asm parser.
11216 case 'M': {
11217 if (!isUInt<32>(CVal))
11218 return;
11219 if (AArch64_AM::isLogicalImmediate(CVal, 32))
11220 break;
11221 if ((CVal & 0xFFFF) == CVal)
11222 break;
11223 if ((CVal & 0xFFFF0000ULL) == CVal)
11224 break;
11225 uint64_t NCVal = ~(uint32_t)CVal;
11226 if ((NCVal & 0xFFFFULL) == NCVal)
11227 break;
11228 if ((NCVal & 0xFFFF0000ULL) == NCVal)
11229 break;
11230 return;
11231 }
11232 case 'N': {
11233 if (AArch64_AM::isLogicalImmediate(CVal, 64))
11234 break;
11235 if ((CVal & 0xFFFFULL) == CVal)
11236 break;
11237 if ((CVal & 0xFFFF0000ULL) == CVal)
11238 break;
11239 if ((CVal & 0xFFFF00000000ULL) == CVal)
11240 break;
11241 if ((CVal & 0xFFFF000000000000ULL) == CVal)
11242 break;
11243 uint64_t NCVal = ~CVal;
11244 if ((NCVal & 0xFFFFULL) == NCVal)
11245 break;
11246 if ((NCVal & 0xFFFF0000ULL) == NCVal)
11247 break;
11248 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
11249 break;
11250 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
11251 break;
11252 return;
11253 }
11254 default:
11255 return;
11256 }
11257
11258 // All assembler immediates are 64-bit integers.
11259 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
11260 break;
11261 }
11262
11263 if (Result.getNode()) {
11264 Ops.push_back(Result);
11265 return;
11266 }
11267
11268 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
11269}
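// Worked examples for the immediate constraints validated above; the switch
// in LowerAsmOperandForConstraint is authoritative, these values merely
// satisfy it:
//   I: 0..4095, optionally shifted left by 12     e.g. 4095, 0xFFF000
//   J: negatable ADD/SUB immediates               e.g. -1 .. -4095
//   K: 32-bit logical (bitmask) immediate         e.g. 0xAAAAAAAA
//   L: 64-bit logical (bitmask) immediate         e.g. 0xAAAAAAAAAAAAAAAA
//   M: K plus single-MOVZ/MOVN 32-bit constants   e.g. 0x12340000
//   N: L plus single-MOVZ/MOVN 64-bit constants   e.g. 0x1234000000000000
static long AddConstant(long X) {
  long R;
  asm("add %0, %1, %2" : "=r"(R) : "r"(X), "I"(4095L));
  return R;
}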
11270
11271//===----------------------------------------------------------------------===//
11272// AArch64 Advanced SIMD Support
11273//===----------------------------------------------------------------------===//
11274
11275/// WidenVector - Given a value in the V64 register class, produce the
11276/// equivalent value in the V128 register class.
11277static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
11278 EVT VT = V64Reg.getValueType();
11279 unsigned NarrowSize = VT.getVectorNumElements();
11280 MVT EltTy = VT.getVectorElementType().getSimpleVT();
11281 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
11282 SDLoc DL(V64Reg);
11283
11284 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
11285 V64Reg, DAG.getConstant(0, DL, MVT::i64));
11286}
11287
11288/// getExtFactor - Determine the adjustment factor for the position when
11289/// generating an "extract from vector registers" instruction.
11290static unsigned getExtFactor(SDValue &V) {
11291 EVT EltType = V.getValueType().getVectorElementType();
11292 return EltType.getSizeInBits() / 8;
11293}
11294
11295// Check if a vector is built from one vector via extracted elements of
11296// another together with an AND mask, ensuring that all elements fit
11297// within range. This can be reconstructed using AND and NEON's TBL1.
11298static SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) {
11299 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
11300 SDLoc dl(Op);
11301 EVT VT = Op.getValueType();
11302 assert(!VT.isScalableVector() &&
11303 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
11304
11305 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
11306 // directly to TBL1.
11307 if (VT != MVT::v16i8 && VT != MVT::v8i8)
11308 return SDValue();
11309
11310 unsigned NumElts = VT.getVectorNumElements();
11311 assert((NumElts == 8 || NumElts == 16) &&
11312 "Need to have exactly 8 or 16 elements in vector.");
11313
11314 SDValue SourceVec;
11315 SDValue MaskSourceVec;
11316 SmallVector<SDValue, 16> AndMaskConstants;
11317
11318 for (unsigned i = 0; i < NumElts; ++i) {
11319 SDValue V = Op.getOperand(i);
11320 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11321 return SDValue();
11322
11323 SDValue OperandSourceVec = V.getOperand(0);
11324 if (!SourceVec)
11325 SourceVec = OperandSourceVec;
11326 else if (SourceVec != OperandSourceVec)
11327 return SDValue();
11328
11329 // This only looks at shuffles with elements that are
11330 // a) truncated by a constant AND mask extracted from a mask vector, or
11331 // b) extracted directly from a mask vector.
11332 SDValue MaskSource = V.getOperand(1);
11333 if (MaskSource.getOpcode() == ISD::AND) {
11334 if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
11335 return SDValue();
11336
11337 AndMaskConstants.push_back(MaskSource.getOperand(1));
11338 MaskSource = MaskSource->getOperand(0);
11339 } else if (!AndMaskConstants.empty()) {
11340 // Either all or no operands should have an AND mask.
11341 return SDValue();
11342 }
11343
11344 // An ANY_EXTEND may be inserted between the AND and the source vector
11345 // extraction. We don't care about that, so we can just skip it.
11346 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
11347 MaskSource = MaskSource.getOperand(0);
11348
11349 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11350 return SDValue();
11351
11352 SDValue MaskIdx = MaskSource.getOperand(1);
11353 if (!isa<ConstantSDNode>(MaskIdx) ||
11354 !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
11355 return SDValue();
11356
11357 // We only apply this if all elements come from the same vector with the
11358 // same vector type.
11359 if (!MaskSourceVec) {
11360 MaskSourceVec = MaskSource->getOperand(0);
11361 if (MaskSourceVec.getValueType() != VT)
11362 return SDValue();
11363 } else if (MaskSourceVec != MaskSource->getOperand(0)) {
11364 return SDValue();
11365 }
11366 }
11367
11368 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
11369 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
11370 // insert, we know that the index in the mask must be smaller than the number
11371 // of elements in the source, or we would have an out-of-bounds access.
11372 if (NumElts == 8)
11373 SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, SourceVec,
11374 DAG.getUNDEF(VT));
11375
11376 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
11377 if (!AndMaskConstants.empty())
11378 MaskSourceVec = DAG.getNode(ISD::AND, dl, VT, MaskSourceVec,
11379 DAG.getBuildVector(VT, dl, AndMaskConstants));
11380
11381 return DAG.getNode(
11382 ISD::INTRINSIC_WO_CHAIN, dl, VT,
11383 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, dl, MVT::i32), SourceVec,
11384 MaskSourceVec);
11385}
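// Behavioural sketch of the single-register TBL the routine above builds:
// each result byte i becomes SourceVec[Index[i]] when Index[i] < 16 and 0
// otherwise, and the optional AND of the index vector keeps the indices in
// range. Plain C++ model for illustration, not the authoritative NEON
// semantics.
#include <array>
#include <cstdint>

static std::array<uint8_t, 16> Tbl1(const std::array<uint8_t, 16> &Src,
                                    const std::array<uint8_t, 16> &Index) {
  std::array<uint8_t, 16> Out{};
  for (unsigned I = 0; I < 16; ++I)
    Out[I] = Index[I] < 16 ? Src[Index[I]] : 0;
  return Out;
}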
11386
11387// Gather data to see if the operation can be modelled as a
11388// shuffle in combination with VEXTs.
11389SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
11390 SelectionDAG &DAG) const {
11391 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
11392 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
11393 SDLoc dl(Op);
11394 EVT VT = Op.getValueType();
11395 assert(!VT.isScalableVector() &&
11396 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
11397 unsigned NumElts = VT.getVectorNumElements();
11398
11399 struct ShuffleSourceInfo {
11400 SDValue Vec;
11401 unsigned MinElt;
11402 unsigned MaxElt;
11403
11404 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
11405 // be compatible with the shuffle we intend to construct. As a result
11406 // ShuffleVec will be some sliding window into the original Vec.
11407 SDValue ShuffleVec;
11408
11409 // Code should guarantee that element i in Vec starts at element "WindowBase
11410 // + i * WindowScale in ShuffleVec".
11411 int WindowBase;
11412 int WindowScale;
11413
11414 ShuffleSourceInfo(SDValue Vec)
11415 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
11416 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
11417
11418 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
11419 };
11420
11421 // First gather all vectors used as an immediate source for this BUILD_VECTOR
11422 // node.
11423 SmallVector<ShuffleSourceInfo, 2> Sources;
11424 for (unsigned i = 0; i < NumElts; ++i) {
11425 SDValue V = Op.getOperand(i);
11426 if (V.isUndef())
11427 continue;
11428 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11429 !isa<ConstantSDNode>(V.getOperand(1)) ||
11430 V.getOperand(0).getValueType().isScalableVector()) {
11431 LLVM_DEBUG(
11432 dbgs() << "Reshuffle failed: "
11433 "a shuffle can only come from building a vector from "
11434 "various elements of other fixed-width vectors, provided "
11435 "their indices are constant\n");
11436 return SDValue();
11437 }
11438
11439 // Add this element source to the list if it's not already there.
11440 SDValue SourceVec = V.getOperand(0);
11441 auto Source = find(Sources, SourceVec);
11442 if (Source == Sources.end())
11443 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
11444
11445 // Update the minimum and maximum lane number seen.
11446 unsigned EltNo = V.getConstantOperandVal(1);
11447 Source->MinElt = std::min(Source->MinElt, EltNo);
11448 Source->MaxElt = std::max(Source->MaxElt, EltNo);
11449 }
11450
11451 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
11452 // better than moving to/from gpr registers for larger vectors.
11453 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
11454 // Construct a mask for the tbl. We may need to adjust the index for types
11455 // larger than i8.
11456 SmallVector<unsigned, 16> Mask;
11457 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
11458 for (unsigned I = 0; I < NumElts; ++I) {
11459 SDValue V = Op.getOperand(I);
11460 if (V.isUndef()) {
11461 for (unsigned OF = 0; OF < OutputFactor; OF++)
11462 Mask.push_back(-1);
11463 continue;
11464 }
11465 // Set the Mask lanes adjusted for the size of the input and output
11466 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
11467 // output element, adjusted in their positions per input and output types.
11468 unsigned Lane = V.getConstantOperandVal(1);
11469 for (unsigned S = 0; S < Sources.size(); S++) {
11470 if (V.getOperand(0) == Sources[S].Vec) {
11471 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
11472 unsigned InputBase = 16 * S + Lane * InputSize / 8;
11473 for (unsigned OF = 0; OF < OutputFactor; OF++)
11474 Mask.push_back(InputBase + OF);
11475 break;
11476 }
11477 }
11478 }
11479
11480 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
11481 // v16i8, and the TBLMask
11482 SmallVector<SDValue, 16> TBLOperands;
11483 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
11484 ? Intrinsic::aarch64_neon_tbl3
11485 : Intrinsic::aarch64_neon_tbl4,
11486 dl, MVT::i32));
11487 for (unsigned i = 0; i < Sources.size(); i++) {
11488 SDValue Src = Sources[i].Vec;
11489 EVT SrcVT = Src.getValueType();
11490 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
11491 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
11492 "Expected a legally typed vector");
11493 if (SrcVT.is64BitVector())
11494 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src,
11495 DAG.getUNDEF(MVT::v8i8));
11496 TBLOperands.push_back(Src);
11497 }
11498
11499 SmallVector<SDValue, 16> TBLMask;
11500 for (unsigned i = 0; i < Mask.size(); i++)
11501 TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32));
11502 assert((Mask.size() == 8 || Mask.size() == 16) &&
11503 "Expected a v8i8 or v16i8 Mask");
11504 TBLOperands.push_back(
11505 DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));
11506
11507 SDValue Shuffle =
11508 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
11509 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
11510 return DAG.getBitcast(VT, Shuffle);
11511 }
11512
11513 if (Sources.size() > 2) {
11514 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
11515 << "sensible when at most two source vectors are "
11516 << "involved\n");
11517 return SDValue();
11518 }
11519
11520 // Find out the smallest element size among result and two sources, and use
11521 // it as element size to build the shuffle_vector.
11522 EVT SmallestEltTy = VT.getVectorElementType();
11523 for (auto &Source : Sources) {
11524 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
11525 if (SrcEltTy.bitsLT(SmallestEltTy)) {
11526 SmallestEltTy = SrcEltTy;
11527 }
11528 }
11529 unsigned ResMultiplier =
11530 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
11531 uint64_t VTSize = VT.getFixedSizeInBits();
11532 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
11533 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
11534
11535 // If the source vector is too wide or too narrow, we may nevertheless be able
11536 // to construct a compatible shuffle either by concatenating it with UNDEF or
11537 // extracting a suitable range of elements.
11538 for (auto &Src : Sources) {
11539 EVT SrcVT = Src.ShuffleVec.getValueType();
11540
11541 TypeSize SrcVTSize = SrcVT.getSizeInBits();
11542 if (SrcVTSize == TypeSize::getFixed(VTSize))
11543 continue;
11544
11545 // This stage of the search produces a source with the same element type as
11546 // the original, but with a total width matching the BUILD_VECTOR output.
11547 EVT EltVT = SrcVT.getVectorElementType();
11548 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
11549 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
11550
11551 if (SrcVTSize.getFixedValue() < VTSize) {
11552 assert(2 * SrcVTSize == VTSize);
11553 // We can pad out the smaller vector for free, so if it's part of a
11554 // shuffle...
11555 Src.ShuffleVec =
11556 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
11557 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
11558 continue;
11559 }
11560
11561 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
11562 LLVM_DEBUG(
11563 dbgs() << "Reshuffle failed: result vector too small to extract\n");
11564 return SDValue();
11565 }
11566
11567 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
11568 LLVM_DEBUG(
11569 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
11570 return SDValue();
11571 }
11572
11573 if (Src.MinElt >= NumSrcElts) {
11574 // The extraction can just take the second half
11575 Src.ShuffleVec =
11576 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11577 DAG.getConstant(NumSrcElts, dl, MVT::i64));
11578 Src.WindowBase = -NumSrcElts;
11579 } else if (Src.MaxElt < NumSrcElts) {
11580 // The extraction can just take the first half
11581 Src.ShuffleVec =
11582 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11583 DAG.getConstant(0, dl, MVT::i64));
11584 } else {
11585 // An actual VEXT is needed
11586 SDValue VEXTSrc1 =
11587 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11588 DAG.getConstant(0, dl, MVT::i64));
11589 SDValue VEXTSrc2 =
11590 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11591 DAG.getConstant(NumSrcElts, dl, MVT::i64));
11592 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
11593
11594 if (!SrcVT.is64BitVector()) {
11595 LLVM_DEBUG(
11596 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
11597 "for SVE vectors.");
11598 return SDValue();
11599 }
11600
11601 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
11602 VEXTSrc2,
11603 DAG.getConstant(Imm, dl, MVT::i32));
11604 Src.WindowBase = -Src.MinElt;
11605 }
11606 }
11607
11608 // Another possible incompatibility occurs from the vector element types. We
11609 // can fix this by bitcasting the source vectors to the same type we intend
11610 // for the shuffle.
11611 for (auto &Src : Sources) {
11612 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
11613 if (SrcEltTy == SmallestEltTy)
11614 continue;
11615 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
11616 if (DAG.getDataLayout().isBigEndian()) {
11617 Src.ShuffleVec =
11618 DAG.getNode(AArch64ISD::NVCAST, dl, ShuffleVT, Src.ShuffleVec);
11619 } else {
11620 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
11621 }
11622 Src.WindowScale =
11623 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
11624 Src.WindowBase *= Src.WindowScale;
11625 }
11626
11627 // Final check before we try to actually produce a shuffle.
11628 LLVM_DEBUG(for (auto Src
11629 : Sources)
11630 assert(Src.ShuffleVec.getValueType() == ShuffleVT););
11631
11632 // The stars all align, our next step is to produce the mask for the shuffle.
11633 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
11634 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
11635 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
11636 SDValue Entry = Op.getOperand(i);
11637 if (Entry.isUndef())
11638 continue;
11639
11640 auto Src = find(Sources, Entry.getOperand(0));
11641 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
11642
11643 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
11644 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
11645 // segment.
11646 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
11647 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
11648 VT.getScalarSizeInBits());
11649 int LanesDefined = BitsDefined / BitsPerShuffleLane;
11650
11651 // This source is expected to fill ResMultiplier lanes of the final shuffle,
11652 // starting at the appropriate offset.
11653 int *LaneMask = &Mask[i * ResMultiplier];
11654
11655 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
11656 ExtractBase += NumElts * (Src - Sources.begin());
11657 for (int j = 0; j < LanesDefined; ++j)
11658 LaneMask[j] = ExtractBase + j;
11659 }
11660
11661 // Final check before we try to produce nonsense...
11662 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
11663 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
11664 return SDValue();
11665 }
11666
11667 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
11668 for (unsigned i = 0; i < Sources.size(); ++i)
11669 ShuffleOps[i] = Sources[i].ShuffleVec;
11670
11671 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
11672 ShuffleOps[1], Mask);
11673 SDValue V;
11674 if (DAG.getDataLayout().isBigEndian()) {
11675 V = DAG.getNode(AArch64ISD::NVCAST, dl, VT, Shuffle);
11676 } else {
11677 V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
11678 }
11679
11680 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
11681 dbgs() << "Reshuffle, creating node: "; V.dump(););
11682
11683 return V;
11684}
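// The TBL path above widens each output lane into byte indices; the same
// arithmetic as a standalone helper: source S contributes byte indices
// starting at 16 * S, and a lane of a wider element expands to one index per
// byte. Hypothetical helper, for illustration only.
#include <vector>

static std::vector<int> TblByteIndices(unsigned SourceIdx, unsigned Lane,
                                       unsigned InputEltBits,
                                       unsigned OutputEltBits) {
  std::vector<int> Bytes;
  unsigned Base = 16 * SourceIdx + Lane * (InputEltBits / 8);
  for (unsigned OF = 0; OF < OutputEltBits / 8; ++OF)
    Bytes.push_back(static_cast<int>(Base + OF));
  return Bytes;
}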
11685
11686// check if an EXT instruction can handle the shuffle mask when the
11687// vector sources of the shuffle are the same.
11688static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
11689 unsigned NumElts = VT.getVectorNumElements();
11690
11691 // Assume that the first shuffle index is not UNDEF. Fail if it is.
11692 if (M[0] < 0)
11693 return false;
11694
11695 Imm = M[0];
11696
11697 // If this is a VEXT shuffle, the immediate value is the index of the first
11698 // element. The other shuffle indices must be the successive elements after
11699 // the first one.
11700 unsigned ExpectedElt = Imm;
11701 for (unsigned i = 1; i < NumElts; ++i) {
11702 // Increment the expected index. If it wraps around, just follow it
11703 // back to index zero and keep going.
11704 ++ExpectedElt;
11705 if (ExpectedElt == NumElts)
11706 ExpectedElt = 0;
11707
11708 if (M[i] < 0)
11709 continue; // ignore UNDEF indices
11710 if (ExpectedElt != static_cast<unsigned>(M[i]))
11711 return false;
11712 }
11713
11714 return true;
11715}
11716
11717// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
11718// v4i32s. This is really a truncate, which we can construct out of (legal)
11719// concats and truncate nodes.
11720static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
11721 if (V.getValueType() != MVT::v16i8)
11722 return SDValue();
11723 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
11724
11725 for (unsigned X = 0; X < 4; X++) {
11726 // Check the first item in each group is an extract from lane 0 of a v4i32
11727 // or v4i16.
11728 SDValue BaseExt = V.getOperand(X * 4);
11729 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11730 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
11731 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
11732 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
11733 BaseExt.getConstantOperandVal(1) != 0)
11734 return SDValue();
11735 SDValue Base = BaseExt.getOperand(0);
11736 // And check the other items are extracts from the same vector.
11737 for (unsigned Y = 1; Y < 4; Y++) {
11738 SDValue Ext = V.getOperand(X * 4 + Y);
11739 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11740 Ext.getOperand(0) != Base ||
11741 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
11742 Ext.getConstantOperandVal(1) != Y)
11743 return SDValue();
11744 }
11745 }
11746
11747 // Turn the buildvector into a series of truncates and concats, which will
11748 // become uzip1's. Any v4i32s we found get truncated to v4i16, which are
11749 // concatenated together to produce 2 v8i16. These are both truncated and
11750 // concatenated together.
11751 SDLoc DL(V);
11752 SDValue Trunc[4] = {
11753 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
11754 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
11755 for (SDValue &V : Trunc)
11756 if (V.getValueType() == MVT::v4i32)
11757 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
11758 SDValue Concat0 =
11759 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
11760 SDValue Concat1 =
11761 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
11762 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
11763 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
11764 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
11765}
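// Scalar model of the value the routine above produces: truncate each lane of
// the four v4i32/v4i16 sources to 8 bits and concatenate them in order
// (a0..a3, b0..b3, c0..c3, d0..d3). Illustrative only; the actual lowering
// builds this shape out of TRUNCATE/CONCAT (UZP1) nodes.
#include <array>
#include <cstdint>

static std::array<uint8_t, 16>
TruncateAndConcat(const std::array<std::array<uint32_t, 4>, 4> &Srcs) {
  std::array<uint8_t, 16> Out{};
  for (unsigned S = 0; S < 4; ++S)
    for (unsigned L = 0; L < 4; ++L)
      Out[S * 4 + L] = static_cast<uint8_t>(Srcs[S][L]);
  return Out;
}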
11766
11767/// Check if a vector shuffle corresponds to a DUP instruction with a larger
11768/// element width than the vector lane type. If that is the case the function
11769/// returns true and writes the value of the DUP instruction lane operand into
11770/// DupLaneOp.
11771static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
11772 unsigned &DupLaneOp) {
11773 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
11774 "Only possible block sizes for wide DUP are: 16, 32, 64");
11775
11776 if (BlockSize <= VT.getScalarSizeInBits())
11777 return false;
11778 if (BlockSize % VT.getScalarSizeInBits() != 0)
11779 return false;
11780 if (VT.getSizeInBits() % BlockSize != 0)
11781 return false;
11782
11783 size_t SingleVecNumElements = VT.getVectorNumElements();
11784 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
11785 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
11786
11787 // We are looking for masks like
11788 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
11789 // might be replaced by 'undefined'. BlockIndices will eventually contain
11790 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
11791 // for the above examples)
11792 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
11793 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
11794 for (size_t I = 0; I < NumEltsPerBlock; I++) {
11795 int Elt = M[BlockIndex * NumEltsPerBlock + I];
11796 if (Elt < 0)
11797 continue;
11798 // For now we don't support shuffles that use the second operand
11799 if ((unsigned)Elt >= SingleVecNumElements)
11800 return false;
11801 if (BlockElts[I] < 0)
11802 BlockElts[I] = Elt;
11803 else if (BlockElts[I] != Elt)
11804 return false;
11805 }
11806
11807 // We found a candidate block (possibly with some undefs). It must be a
11808 // sequence of consecutive integers starting with a value divisible by
11809 // NumEltsPerBlock with some values possibly replaced by undef-s.
11810
11811 // Find first non-undef element
11812 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
11813 assert(FirstRealEltIter != BlockElts.end() &&
11814 "Shuffle with all-undefs must have been caught by previous cases, "
11815 "e.g. isSplat()");
11816 if (FirstRealEltIter == BlockElts.end()) {
11817 DupLaneOp = 0;
11818 return true;
11819 }
11820
11821 // Index of FirstRealElt in BlockElts
11822 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
11823
11824 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
11825 return false;
11826 // BlockElts[0] must have the following value if it isn't undef:
11827 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
11828
11829 // Check the first element
11830 if (Elt0 % NumEltsPerBlock != 0)
11831 return false;
11832 // Check that the sequence indeed consists of consecutive integers (modulo
11833 // undefs)
11834 for (size_t I = 0; I < NumEltsPerBlock; I++)
11835 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
11836 return false;
11837
11838 DupLaneOp = Elt0 / NumEltsPerBlock;
11839 return true;
11840}
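// Concrete instances of the masks isWideDUPMask accepts, for v4i32 with
// BlockSize 64 (NumEltsPerBlock = 2): {0,1,0,1} gives DupLaneOp 0,
// {2,3,2,3} gives DupLaneOp 1, and {-1,3,2,3} is still accepted (lane 1)
// because undefs are ignored. Minimal restatement of the lane computation,
// assuming a mask that already passed the checks above.
static unsigned WideDupLane(int FirstDefinedElt, unsigned FirstDefinedIdx,
                            unsigned NumEltsPerBlock) {
  unsigned Elt0 = static_cast<unsigned>(FirstDefinedElt) - FirstDefinedIdx;
  return Elt0 / NumEltsPerBlock;
}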
11841
11842// check if an EXT instruction can handle the shuffle mask when the
11843// vector sources of the shuffle are different.
11844static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
11845 unsigned &Imm) {
11846 // Look for the first non-undef element.
11847 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
11848
11849 // Benefit from APInt to handle overflow when calculating the expected element.
11850 unsigned NumElts = VT.getVectorNumElements();
11851 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
11852 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
11853 // The following shuffle indices must be the successive elements after the
11854 // first real element.
11855 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
11856 return Elt != ExpectedElt++ && Elt != -1;
11857 });
11858 if (FoundWrongElt)
11859 return false;
11860
11861 // The index of an EXT is the first element if it is not UNDEF.
11862 // Watch out for the beginning UNDEFs. The EXT index should be the expected
11863 // value of the first element. E.g.
11864 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
11865 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
11866 // ExpectedElt is the last mask index plus 1.
11867 Imm = ExpectedElt.getZExtValue();
11868
11869 // There are two different cases that require reversing the input vectors.
11870 // For example, for vector <4 x i32> we have the following cases,
11871 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
11872 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
11873 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
11874 // to reverse two input vectors.
11875 if (Imm < NumElts)
11876 ReverseEXT = true;
11877 else
11878 Imm -= NumElts;
11879
11880 return true;
11881}
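// Worked instances of the two cases described above, for a v4i32 shuffle
// (NumElts = 4), following the arithmetic in isEXTMask:
//   M = <2,3,4,5>   : "last index + 1" is 6 >= NumElts, so Imm = 6 - 4 = 2
//                     and the operands keep their order (ext ..., #2).
//   M = <-1,-1,7,0> : treated as <5,6,7,0>; the wrapped "last index + 1" is
//                     1 < NumElts, so ReverseEXT is set and Imm = 1.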
11882
11883/// isREVMask - Check if a vector shuffle corresponds to a REV
11884/// instruction with the specified blocksize. (The order of the elements
11885/// within each block of the vector is reversed.)
11886static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
11887 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64 ||
11888 BlockSize == 128) &&
11889 "Only possible block sizes for REV are: 16, 32, 64, 128");
11890
11891 unsigned EltSz = VT.getScalarSizeInBits();
11892 unsigned NumElts = VT.getVectorNumElements();
11893 unsigned BlockElts = M[0] + 1;
11894 // If the first shuffle index is UNDEF, be optimistic.
11895 if (M[0] < 0)
11896 BlockElts = BlockSize / EltSz;
11897
11898 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
11899 return false;
11900
11901 for (unsigned i = 0; i < NumElts; ++i) {
11902 if (M[i] < 0)
11903 continue; // ignore UNDEF indices
11904 if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
11905 return false;
11906 }
11907
11908 return true;
11909}
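// Concrete REV masks accepted by isREVMask, for v8i16 (EltSz = 16):
//   REV64 (BlockSize 64, BlockElts 4): <3,2,1,0,7,6,5,4>
//   REV32 (BlockSize 32, BlockElts 2): <1,0,3,2,5,4,7,6>
//   BlockSize 16 is rejected because BlockSize <= EltSz.
// Generator mirroring the per-element check above, for illustration only.
#include <vector>

static std::vector<int> ExpectedRevMask(unsigned NumElts, unsigned BlockElts) {
  std::vector<int> M(NumElts);
  for (unsigned I = 0; I < NumElts; ++I)
    M[I] = static_cast<int>((I - I % BlockElts) +
                            (BlockElts - 1 - I % BlockElts));
  return M;
}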
11910
11911static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11912 unsigned NumElts = VT.getVectorNumElements();
11913 if (NumElts % 2 != 0)
11914 return false;
11915 WhichResult = (M[0] == 0 ? 0 : 1);
11916 for (unsigned i = 0; i < NumElts; i += 2) {
11917 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
11918 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
11919 return false;
11920 }
11921 return true;
11922}
11923
11924/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
11925/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11926/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
11927static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11928 unsigned NumElts = VT.getVectorNumElements();
11929 if (NumElts % 2 != 0)
11930 return false;
11931 WhichResult = (M[0] == 0 ? 0 : 1);
11932 unsigned Idx = WhichResult * NumElts / 2;
11933 for (unsigned i = 0; i != NumElts; i += 2) {
11934 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
11935 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
11936 return false;
11937 Idx += 1;
11938 }
11939
11940 return true;
11941}
11942
11943/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
11944/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11945/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
11946static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11947 unsigned Half = VT.getVectorNumElements() / 2;
11948 WhichResult = (M[0] == 0 ? 0 : 1);
11949 for (unsigned j = 0; j != 2; ++j) {
11950 unsigned Idx = WhichResult;
11951 for (unsigned i = 0; i != Half; ++i) {
11952 int MIdx = M[i + j * Half];
11953 if (MIdx >= 0 && (unsigned)MIdx != Idx)
11954 return false;
11955 Idx += 2;
11956 }
11957 }
11958
11959 return true;
11960}
11961
11962/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
11963/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11964/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
11965static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11966 unsigned NumElts = VT.getVectorNumElements();
11967 if (NumElts % 2 != 0)
11968 return false;
11969 WhichResult = (M[0] == 0 ? 0 : 1);
11970 for (unsigned i = 0; i < NumElts; i += 2) {
11971 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
11972 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
11973 return false;
11974 }
11975 return true;
11976}
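// Canonical single-input masks accepted by the three *_v_undef helpers above,
// written out for a v4i32 shuffle (WhichResult = 0 / 1):
//   ZIP: <0,0,1,1> / <2,2,3,3>
//   UZP: <0,2,0,2> / <1,3,1,3>
//   TRN: <0,0,2,2> / <1,1,3,3>
// Any lane may additionally be -1 (undef), since undef indices are skipped.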
11977
11978static bool isINSMask(ArrayRef<int> M, int NumInputElements,
11979 bool &DstIsLeft, int &Anomaly) {
11980 if (M.size() != static_cast<size_t>(NumInputElements))
11981 return false;
11982
11983 int NumLHSMatch = 0, NumRHSMatch = 0;
11984 int LastLHSMismatch = -1, LastRHSMismatch = -1;
11985
11986 for (int i = 0; i < NumInputElements; ++i) {
11987 if (M[i] == -1) {
11988 ++NumLHSMatch;
11989 ++NumRHSMatch;
11990 continue;
11991 }
11992
11993 if (M[i] == i)
11994 ++NumLHSMatch;
11995 else
11996 LastLHSMismatch = i;
11997
11998 if (M[i] == i + NumInputElements)
11999 ++NumRHSMatch;
12000 else
12001 LastRHSMismatch = i;
12002 }
12003
12004 if (NumLHSMatch == NumInputElements - 1) {
12005 DstIsLeft = true;
12006 Anomaly = LastLHSMismatch;
12007 return true;
12008 } else if (NumRHSMatch == NumInputElements - 1) {
12009 DstIsLeft = false;
12010 Anomaly = LastRHSMismatch;
12011 return true;
12012 }
12013
12014 return false;
12015}
12016
12017static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
12018 if (VT.getSizeInBits() != 128)
12019 return false;
12020
12021 unsigned NumElts = VT.getVectorNumElements();
12022
12023 for (int I = 0, E = NumElts / 2; I != E; I++) {
12024 if (Mask[I] != I)
12025 return false;
12026 }
12027
12028 int Offset = NumElts / 2;
12029 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
12030 if (Mask[I] != I + SplitLHS * Offset)
12031 return false;
12032 }
12033
12034 return true;
12035}
12036
12037static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
12038 SDLoc DL(Op);
12039 EVT VT = Op.getValueType();
12040 SDValue V0 = Op.getOperand(0);
12041 SDValue V1 = Op.getOperand(1);
12042 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
12043
12044 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
12045 VT.getVectorElementType() != V1.getValueType().getVectorElementType())
12046 return SDValue();
12047
12048 bool SplitV0 = V0.getValueSizeInBits() == 128;
12049
12050 if (!isConcatMask(Mask, VT, SplitV0))
12051 return SDValue();
12052
12053 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
12054 if (SplitV0) {
12055 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
12056 DAG.getConstant(0, DL, MVT::i64));
12057 }
12058 if (V1.getValueSizeInBits() == 128) {
12059 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
12060 DAG.getConstant(0, DL, MVT::i64));
12061 }
12062 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
12063}
12064
12065/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
12066/// the specified operations to build the shuffle. ID is the perfect-shuffle
12067/// ID, V1 and V2 are the original shuffle inputs. PFEntry is the perfect-shuffle
12068/// table entry and LHS/RHS are the immediate inputs for this stage of the
12069/// shuffle.
12070static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
12071 SDValue V2, unsigned PFEntry, SDValue LHS,
12072 SDValue RHS, SelectionDAG &DAG,
12073 const SDLoc &dl) {
12074 unsigned OpNum = (PFEntry >> 26) & 0x0F;
12075 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
12076 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
12077
12078 enum {
12079 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
12080 OP_VREV,
12081 OP_VDUP0,
12082 OP_VDUP1,
12083 OP_VDUP2,
12084 OP_VDUP3,
12085 OP_VEXT1,
12086 OP_VEXT2,
12087 OP_VEXT3,
12088 OP_VUZPL, // VUZP, left result
12089 OP_VUZPR, // VUZP, right result
12090 OP_VZIPL, // VZIP, left result
12091 OP_VZIPR, // VZIP, right result
12092 OP_VTRNL, // VTRN, left result
12093 OP_VTRNR, // VTRN, right result
12094 OP_MOVLANE // Move lane. RHSID is the lane to move into
12095 };
12096
12097 if (OpNum == OP_COPY) {
12098 if (LHSID == (1 * 9 + 2) * 9 + 3)
12099 return LHS;
12100 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
12101 return RHS;
12102 }
12103
12104 if (OpNum == OP_MOVLANE) {
12105 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
12106 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
12107 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
12108 Elt = 3 - Elt;
12109 while (Elt > 0) {
12110 ID /= 9;
12111 Elt--;
12112 }
12113 return (ID % 9 == 8) ? -1 : ID % 9;
12114 };
12115
12116 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
12117 // get the lane to move from the PFID, which is always from the
12118 // original vectors (V1 or V2).
12119 SDValue OpLHS = GeneratePerfectShuffle(
12120 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
12121 EVT VT = OpLHS.getValueType();
12122 assert(RHSID < 8 && "Expected a lane index for RHSID!");
12123 unsigned ExtLane = 0;
12124 SDValue Input;
12125
12126 // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
12127 // convert into a higher type.
12128 if (RHSID & 0x4) {
12129 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
12130 if (MaskElt == -1)
12131 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
12132 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
12133 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
12134 Input = MaskElt < 2 ? V1 : V2;
12135 if (VT.getScalarSizeInBits() == 16) {
12136 Input = DAG.getBitcast(MVT::v2f32, Input);
12137 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
12138 } else {
12139 assert(VT.getScalarSizeInBits() == 32 &&
12140 "Expected 16 or 32 bit shuffle elemements");
12141 Input = DAG.getBitcast(MVT::v2f64, Input);
12142 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
12143 }
12144 } else {
12145 int MaskElt = getPFIDLane(ID, RHSID);
12146 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
12147 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
12148 Input = MaskElt < 4 ? V1 : V2;
12149 // Be careful about creating illegal types. Use f16 instead of i16.
12150 if (VT == MVT::v4i16) {
12151 Input = DAG.getBitcast(MVT::v4f16, Input);
12152 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
12153 }
12154 }
12155 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
12156 Input.getValueType().getVectorElementType(),
12157 Input, DAG.getVectorIdxConstant(ExtLane, dl));
12158 SDValue Ins =
12159 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS,
12160 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl));
12161 return DAG.getBitcast(VT, Ins);
12162 }
12163
12164 SDValue OpLHS, OpRHS;
12165 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
12166 RHS, DAG, dl);
12167 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
12168 RHS, DAG, dl);
12169 EVT VT = OpLHS.getValueType();
12170
12171 switch (OpNum) {
12172 default:
12173 llvm_unreachable("Unknown shuffle opcode!");
12174 case OP_VREV:
12175 // VREV divides the vector in half and swaps within the half.
12176 if (VT.getVectorElementType() == MVT::i32 ||
12177 VT.getVectorElementType() == MVT::f32)
12178 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
12179 // vrev <4 x i16> -> REV32
12180 if (VT.getVectorElementType() == MVT::i16 ||
12181 VT.getVectorElementType() == MVT::f16 ||
12182 VT.getVectorElementType() == MVT::bf16)
12183 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
12184 // vrev <4 x i8> -> REV16
12185 assert(VT.getVectorElementType() == MVT::i8);
12186 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
12187 case OP_VDUP0:
12188 case OP_VDUP1:
12189 case OP_VDUP2:
12190 case OP_VDUP3: {
12191 EVT EltTy = VT.getVectorElementType();
12192 unsigned Opcode;
12193 if (EltTy == MVT::i8)
12194 Opcode = AArch64ISD::DUPLANE8;
12195 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
12196 Opcode = AArch64ISD::DUPLANE16;
12197 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
12198 Opcode = AArch64ISD::DUPLANE32;
12199 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
12200 Opcode = AArch64ISD::DUPLANE64;
12201 else
12202 llvm_unreachable("Invalid vector element type?");
12203
12204 if (VT.getSizeInBits() == 64)
12205 OpLHS = WidenVector(OpLHS, DAG);
12206 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
12207 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
12208 }
12209 case OP_VEXT1:
12210 case OP_VEXT2:
12211 case OP_VEXT3: {
12212 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
12213 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
12214 DAG.getConstant(Imm, dl, MVT::i32));
12215 }
12216 case OP_VUZPL:
12217 return DAG.getNode(AArch64ISD::UZP1, dl, VT, OpLHS, OpRHS);
12218 case OP_VUZPR:
12219 return DAG.getNode(AArch64ISD::UZP2, dl, VT, OpLHS, OpRHS);
12220 case OP_VZIPL:
12221 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, OpLHS, OpRHS);
12222 case OP_VZIPR:
12223 return DAG.getNode(AArch64ISD::ZIP2, dl, VT, OpLHS, OpRHS);
12224 case OP_VTRNL:
12225 return DAG.getNode(AArch64ISD::TRN1, dl, VT, OpLHS, OpRHS);
12226 case OP_VTRNR:
12227 return DAG.getNode(AArch64ISD::TRN2, dl, VT, OpLHS, OpRHS);
12228 }
12229}
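// Sketch of how a perfect-shuffle table entry is packed, mirroring the decode
// at the top of GeneratePerfectShuffle: a 4-bit opcode followed by two 13-bit
// operand IDs, where each ID stores four lanes as base-9 digits and the digit
// 8 stands for an undef lane. The helpers below are illustrative, standalone
// C++, not LLVM APIs.
#include <cstdio>

struct PFEntryFieldsSketch { unsigned OpNum, LHSID, RHSID; };

static PFEntryFieldsSketch decodePFEntrySketch(unsigned PFEntry) {
  return {(PFEntry >> 26) & 0x0F, (PFEntry >> 13) & ((1u << 13) - 1),
          PFEntry & ((1u << 13) - 1)};
}

// Recover lane Elt (0..3) of an operand ID, as getPFIDLane does; -1 == undef.
static int pfidLaneSketch(unsigned ID, int Elt) {
  for (int i = 3 - Elt; i > 0; --i)
    ID /= 9;
  return (ID % 9 == 8) ? -1 : (int)(ID % 9);
}

int main() {
  // The two OP_COPY identities checked above: <0,1,2,3> = 102, <4,5,6,7> = 3382.
  std::printf("%d %d %d %d\n", pfidLaneSketch(102, 0), pfidLaneSketch(102, 1),
              pfidLaneSketch(102, 2), pfidLaneSketch(102, 3)); // 0 1 2 3
  std::printf("%d %d\n", pfidLaneSketch(3382, 0), pfidLaneSketch(3382, 3)); // 4 7
  return 0;
}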
12230
12231static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
12232 SelectionDAG &DAG) {
12233 // Check to see if we can use the TBL instruction.
12234 SDValue V1 = Op.getOperand(0);
12235 SDValue V2 = Op.getOperand(1);
12236 SDLoc DL(Op);
12237
12238 EVT EltVT = Op.getValueType().getVectorElementType();
12239 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
12240
12241 bool Swap = false;
12242 if (V1.isUndef() || isZerosVector(V1.getNode())) {
12243 std::swap(V1, V2);
12244 Swap = true;
12245 }
12246
12247 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
12248 // out of range values with 0s. We do need to make sure that any out-of-range
12249 // values are really out-of-range for a v16i8 vector.
12250 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
12251 MVT IndexVT = MVT::v8i8;
12252 unsigned IndexLen = 8;
12253 if (Op.getValueSizeInBits() == 128) {
12254 IndexVT = MVT::v16i8;
12255 IndexLen = 16;
12256 }
12257
12258 SmallVector<SDValue, 8> TBLMask;
12259 for (int Val : ShuffleMask) {
12260 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
12261 unsigned Offset = Byte + Val * BytesPerElt;
12262 if (Swap)
12263 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
12264 if (IsUndefOrZero && Offset >= IndexLen)
12265 Offset = 255;
12266 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
12267 }
12268 }
12269
12270 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
12271 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
12272
12273 SDValue Shuffle;
12274 if (IsUndefOrZero) {
12275 if (IndexLen == 8)
12276 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
12277 Shuffle = DAG.getNode(
12278 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12279 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
12280 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12281 } else {
12282 if (IndexLen == 8) {
12283 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
12284 Shuffle = DAG.getNode(
12285 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12286 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
12287 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12288 } else {
12289 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
12290 // cannot currently represent the register constraints on the input
12291 // table registers.
12292 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
12293 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
12294 // IndexLen));
12295 Shuffle = DAG.getNode(
12296 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12297 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
12298 V2Cst,
12299 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12300 }
12301 }
12302 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
12303}
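// Worked sketch of the byte-level index vector built above: each shuffle
// element expands to BytesPerElt consecutive byte offsets, and when the second
// source is undef/zero any offset that would reach into it is clamped to 255,
// which is out of range for TBL and therefore reads back as zero. Plain C++
// stand-ins for the DAG types.
#include <cstdint>
#include <vector>

static std::vector<uint8_t>
buildTBLByteMaskSketch(const std::vector<int> &ShuffleMask, unsigned BytesPerElt,
                       unsigned IndexLen /* 8 or 16 */, bool SecondIsUndefOrZero) {
  std::vector<uint8_t> TBLMask;
  for (int Val : ShuffleMask)
    for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
      unsigned Offset = Byte + (unsigned)Val * BytesPerElt;
      if (SecondIsUndefOrZero && Offset >= IndexLen)
        Offset = 255; // out-of-range index => TBL produces 0 for this byte
      TBLMask.push_back((uint8_t)Offset);
    }
  return TBLMask;
}
// Example: a v4i16 shuffle mask <0, 2, 5, 7> with an undef V2 becomes the byte
// indices {0,1, 4,5, 255,255, 255,255}; lanes 5 and 7 would have lived in the
// missing second table register, so they are forced to read as zero.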
12304
12305static unsigned getDUPLANEOp(EVT EltType) {
12306 if (EltType == MVT::i8)
12307 return AArch64ISD::DUPLANE8;
12308 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
12309 return AArch64ISD::DUPLANE16;
12310 if (EltType == MVT::i32 || EltType == MVT::f32)
12311 return AArch64ISD::DUPLANE32;
12312 if (EltType == MVT::i64 || EltType == MVT::f64)
12313 return AArch64ISD::DUPLANE64;
12314
12315 llvm_unreachable("Invalid vector element type?");
12316}
12317
12318static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
12319 unsigned Opcode, SelectionDAG &DAG) {
12320 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
12321 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
12322 // Match: dup (bitcast (extract_subv X, C)), LaneC
12323 if (BitCast.getOpcode() != ISD::BITCAST ||
12324 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
12325 return false;
12326
12327 // The extract index must align in the destination type. That may not
12328 // happen if the bitcast is from a narrow to a wide type.
12329 SDValue Extract = BitCast.getOperand(0);
12330 unsigned ExtIdx = Extract.getConstantOperandVal(1);
12331 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
12332 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
12333 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
12334 if (ExtIdxInBits % CastedEltBitWidth != 0)
12335 return false;
12336
12337 // Can't handle cases where vector size is not 128-bit
12338 if (!Extract.getOperand(0).getValueType().is128BitVector())
12339 return false;
12340
12341 // Update the lane value by offsetting with the scaled extract index.
12342 LaneC += ExtIdxInBits / CastedEltBitWidth;
12343
12344 // Determine the casted vector type of the wide vector input.
12345 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
12346 // Examples:
12347 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
12348 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
12349 unsigned SrcVecNumElts =
12350 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
12351 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
12352 SrcVecNumElts);
12353 return true;
12354 };
12355 MVT CastVT;
12356 if (getScaledOffsetDup(V, Lane, CastVT)) {
12357 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
12358 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12359 V.getOperand(0).getValueType().is128BitVector()) {
12360 // The lane is incremented by the index of the extract.
12361 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
12362 Lane += V.getConstantOperandVal(1);
12363 V = V.getOperand(0);
12364 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
12365 // The lane is decremented if we are splatting from the 2nd operand.
12366 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
12367 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
12368 Lane -= Idx * VT.getVectorNumElements() / 2;
12369 V = WidenVector(V.getOperand(Idx), DAG);
12370 } else if (VT.getSizeInBits() == 64) {
12371 // Widen the operand to 128-bit register with undef.
12372 V = WidenVector(V, DAG);
12373 }
12374 return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
12375}
12376
12377// Return true if we can get a new shuffle mask by checking the parameter mask
12378// array to test whether every two adjacent mask values are continuous and
12379// starting from an even number.
12380static bool isWideTypeMask(ArrayRef<int> M, EVT VT,
12381 SmallVectorImpl<int> &NewMask) {
12382 unsigned NumElts = VT.getVectorNumElements();
12383 if (NumElts % 2 != 0)
12384 return false;
12385
12386 NewMask.clear();
12387 for (unsigned i = 0; i < NumElts; i += 2) {
12388 int M0 = M[i];
12389 int M1 = M[i + 1];
12390
12391 // If both elements are undef, new mask is undef too.
12392 if (M0 == -1 && M1 == -1) {
12393 NewMask.push_back(-1);
12394 continue;
12395 }
12396
12397 if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
12398 NewMask.push_back(M1 / 2);
12399 continue;
12400 }
12401
12402 if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
12403 NewMask.push_back(M0 / 2);
12404 continue;
12405 }
12406
12407 NewMask.clear();
12408 return false;
12409 }
12410
12411 assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
12412 return true;
12413}
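// Standalone illustration of the pair-merging rule above: two adjacent mask
// values collapse into one wide-element index when they are consecutive and
// start at an even position, with undef (-1) allowed to stand in for either
// half. Plain std::vector in place of ArrayRef/SmallVectorImpl.
#include <cassert>
#include <cstddef>
#include <vector>

static bool widenMaskSketch(const std::vector<int> &M, std::vector<int> &NewMask) {
  if (M.size() % 2 != 0)
    return false;
  NewMask.clear();
  for (std::size_t i = 0; i < M.size(); i += 2) {
    int M0 = M[i], M1 = M[i + 1];
    if (M0 == -1 && M1 == -1) { NewMask.push_back(-1); continue; }
    if (M0 == -1 && M1 % 2 == 1) { NewMask.push_back(M1 / 2); continue; }
    if (M0 != -1 && M0 % 2 == 0 && (M1 == M0 + 1 || M1 == -1)) {
      NewMask.push_back(M0 / 2);
      continue;
    }
    NewMask.clear();
    return false;
  }
  return true;
}

int main() {
  std::vector<int> Wide;
  assert(widenMaskSketch({6, 7, 2, 3}, Wide));  // the v4i32 example above
  assert(Wide == std::vector<int>({3, 1}));     // becomes the v2i64 mask <3, 1>
  assert(!widenMaskSketch({1, 2, 4, 5}, Wide)); // pairs don't start on even lanes
  return 0;
}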
12414
12415// Try to widen element type to get a new mask value for a better permutation
12416// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
12417// UZP1/2, TRN1/2, REV, INS, etc.
12418// For example:
12419// shufflevector <4 x i32> %a, <4 x i32> %b,
12420// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
12421// is equivalent to:
12422// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
12423// Finally, we can get:
12424// mov v0.d[0], v1.d[1]
12425static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
12426 SDLoc DL(Op);
12427 EVT VT = Op.getValueType();
12428 EVT ScalarVT = VT.getVectorElementType();
12429 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
12430 SDValue V0 = Op.getOperand(0);
12431 SDValue V1 = Op.getOperand(1);
12432 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
12433
12434 // If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ...
12435 // We need to make sure the wider element type is legal. Thus, ElementSize
12436 // should not be larger than 32 bits, and the i1 type should also be excluded.
12437 if (ElementSize > 32 || ElementSize == 1)
12438 return SDValue();
12439
12440 SmallVector<int, 8> NewMask;
12441 if (isWideTypeMask(Mask, VT, NewMask)) {
12442 MVT NewEltVT = VT.isFloatingPoint()
12443 ? MVT::getFloatingPointVT(ElementSize * 2)
12444 : MVT::getIntegerVT(ElementSize * 2);
12445 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
12446 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
12447 V0 = DAG.getBitcast(NewVT, V0);
12448 V1 = DAG.getBitcast(NewVT, V1);
12449 return DAG.getBitcast(VT,
12450 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
12451 }
12452 }
12453
12454 return SDValue();
12455}
12456
12457// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
12458static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
12459 ArrayRef<int> ShuffleMask,
12460 SelectionDAG &DAG) {
12461 SDValue Tbl1 = Op->getOperand(0);
12462 SDValue Tbl2 = Op->getOperand(1);
12463 SDLoc dl(Op);
12464 SDValue Tbl2ID =
12465 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64);
12466
12467 EVT VT = Op.getValueType();
12468 if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
12469 Tbl1->getOperand(0) != Tbl2ID ||
12470 Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
12471 Tbl2->getOperand(0) != Tbl2ID)
12472 return SDValue();
12473
12474 if (Tbl1->getValueType(0) != MVT::v16i8 ||
12475 Tbl2->getValueType(0) != MVT::v16i8)
12476 return SDValue();
12477
12478 SDValue Mask1 = Tbl1->getOperand(3);
12479 SDValue Mask2 = Tbl2->getOperand(3);
12480 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
12481 for (unsigned I = 0; I < 16; I++) {
12482 if (ShuffleMask[I] < 16)
12483 TBLMaskParts[I] = Mask1->getOperand(ShuffleMask[I]);
12484 else {
12485 auto *C =
12486 dyn_cast<ConstantSDNode>(Mask2->getOperand(ShuffleMask[I] - 16));
12487 if (!C)
12488 return SDValue();
12489 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32);
12490 }
12491 }
12492
12493 SDValue TBLMask = DAG.getBuildVector(VT, dl, TBLMaskParts);
12494 SDValue ID =
12495 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64);
12496
12497 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
12498 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
12499 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
12500}
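// Sketch of the mask arithmetic behind the tbl2+tbl2 -> tbl4 fold above:
// shuffle indices below 16 reuse the first tbl2's byte mask unchanged, while
// indices >= 16 take the second tbl2's mask entry rebased by +32, since that
// data now lives in table registers 3 and 4 of the combined tbl4. Standalone
// helper with plain vectors; the real code keeps these as constant SDNodes.
#include <vector>

static std::vector<int> mergeTbl2MasksSketch(const std::vector<int> &ShuffleMask,
                                             const std::vector<int> &Mask1,
                                             const std::vector<int> &Mask2) {
  std::vector<int> Tbl4Mask(16);
  for (unsigned I = 0; I < 16; ++I)
    Tbl4Mask[I] = ShuffleMask[I] < 16 ? Mask1[ShuffleMask[I]]
                                      : Mask2[ShuffleMask[I] - 16] + 32;
  return Tbl4Mask;
}
// E.g. if ShuffleMask[3] == 20, byte 3 of the tbl4 mask is Mask2[4] + 32: the
// same source byte the second tbl2 would have fetched, re-addressed within the
// 64-byte window spanned by the four table registers.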
12501
12502// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
12503// but we don't have an appropriate instruction,
12504// so custom-lower it as ZIP1-with-zeros.
12505SDValue
12506AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
12507 SelectionDAG &DAG) const {
12508 SDLoc dl(Op);
12509 EVT VT = Op.getValueType();
12510 SDValue SrcOp = Op.getOperand(0);
12511 EVT SrcVT = SrcOp.getValueType();
12512 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
12513 "Unexpected extension factor.");
12514 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
12515 // FIXME: support multi-step zipping?
12516 if (Scale != 2)
12517 return SDValue();
12518 SDValue Zeros = DAG.getConstant(0, dl, SrcVT);
12519 return DAG.getBitcast(VT,
12520 DAG.getNode(AArch64ISD::ZIP1, dl, SrcVT, SrcOp, Zeros));
12521}
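// Scalar model of the ZIP1-with-zeros trick above for Scale == 2: interleaving
// each narrow element with a zero and reinterpreting the (element, 0) pair as
// one double-width lane yields the zero-extended value on a little-endian
// layout. Fixed 32->64 bit widths are assumed purely for illustration.
#include <cassert>
#include <cstdint>
#include <cstring>

static void zip1ZextSketch(const uint32_t Src[2], uint64_t Out[2]) {
  for (int i = 0; i < 2; ++i) {
    uint32_t LanePair[2] = {Src[i], 0u};              // what ZIP1 produces
    uint64_t Wide;
    std::memcpy(&Wide, LanePair, sizeof(Wide));       // the final bitcast
    Out[i] = Wide;                                    // == (uint64_t)Src[i]
  }
}

int main() {
  uint32_t Src[2] = {0xDEADBEEFu, 7u};
  uint64_t Out[2];
  zip1ZextSketch(Src, Out);
  assert(Out[0] == 0xDEADBEEFull && Out[1] == 7ull);  // on a little-endian host
  return 0;
}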
12522
12523SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
12524 SelectionDAG &DAG) const {
12525 SDLoc dl(Op);
12526 EVT VT = Op.getValueType();
12527
12528 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
12529
12530 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
12531 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
12532
12533 // Convert shuffles that are directly supported on NEON to target-specific
12534 // DAG nodes, instead of keeping them as shuffles and matching them again
12535 // during code selection. This is more efficient and avoids the possibility
12536 // of inconsistencies between legalization and selection.
12537 ArrayRef<int> ShuffleMask = SVN->getMask();
12538
12539 SDValue V1 = Op.getOperand(0);
12540 SDValue V2 = Op.getOperand(1);
12541
12542 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
12543 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
12544 "Unexpected VECTOR_SHUFFLE mask size!");
12545
12546 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
12547 return Res;
12548
12549 if (SVN->isSplat()) {
12550 int Lane = SVN->getSplatIndex();
12551 // If this is undef splat, generate it via "just" vdup, if possible.
12552 if (Lane == -1)
12553 Lane = 0;
12554
12555 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
12556 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
12557 V1.getOperand(0));
12558 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
12559 // constant. If so, we can just reference the lane's definition directly.
12560 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
12561 !isa<ConstantSDNode>(V1.getOperand(Lane)))
12562 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
12563
12564 // Otherwise, duplicate from the lane of the input vector.
12565 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
12566 return constructDup(V1, Lane, dl, VT, Opcode, DAG);
12567 }
12568
12569 // Check if the mask matches a DUP for a wider element
12570 for (unsigned LaneSize : {64U, 32U, 16U}) {
12571 unsigned Lane = 0;
12572 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
12573 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
12574 : LaneSize == 32 ? AArch64ISD::DUPLANE32
12575 : AArch64ISD::DUPLANE16;
12576 // Cast V1 to an integer vector with the required lane size
12577 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
12578 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
12579 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
12580 V1 = DAG.getBitcast(NewVecTy, V1);
12581 // Construct the DUP instruction
12582 V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
12583 // Cast back to the original type
12584 return DAG.getBitcast(VT, V1);
12585 }
12586 }
12587
12588 if (isREVMask(ShuffleMask, VT, 64))
12589 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
12590 if (isREVMask(ShuffleMask, VT, 32))
12591 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
12592 if (isREVMask(ShuffleMask, VT, 16))
12593 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
12594
12595 if (((VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 16) ||
12596 (VT.getVectorNumElements() == 16 && VT.getScalarSizeInBits() == 8)) &&
12597 ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
12598 SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
12599 return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
12600 DAG.getConstant(8, dl, MVT::i32));
12601 }
12602
12603 bool ReverseEXT = false;
12604 unsigned Imm;
12605 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
12606 if (ReverseEXT)
12607 std::swap(V1, V2);
12608 Imm *= getExtFactor(V1);
12609 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
12610 DAG.getConstant(Imm, dl, MVT::i32));
12611 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
12612 Imm *= getExtFactor(V1);
12613 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
12614 DAG.getConstant(Imm, dl, MVT::i32));
12615 }
12616
12617 unsigned WhichResult;
12618 if (isZIPMask(ShuffleMask, VT, WhichResult)) {
12619 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
12620 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
12621 }
12622 if (isUZPMask(ShuffleMask, VT, WhichResult)) {
12623 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
12624 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
12625 }
12626 if (isTRNMask(ShuffleMask, VT, WhichResult)) {
12627 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
12628 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
12629 }
12630
12631 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
12632 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
12633 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
12634 }
12635 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
12636 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
12637 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
12638 }
12639 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
12640 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
12641 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
12642 }
12643
12644 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
12645 return Concat;
12646
12647 bool DstIsLeft;
12648 int Anomaly;
12649 int NumInputElements = V1.getValueType().getVectorNumElements();
12650 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
12651 SDValue DstVec = DstIsLeft ? V1 : V2;
12652 SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
12653
12654 SDValue SrcVec = V1;
12655 int SrcLane = ShuffleMask[Anomaly];
12656 if (SrcLane >= NumInputElements) {
12657 SrcVec = V2;
12658 SrcLane -= VT.getVectorNumElements();
12659 }
12660 SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
12661
12662 EVT ScalarVT = VT.getVectorElementType();
12663
12664 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
12665 ScalarVT = MVT::i32;
12666
12667 return DAG.getNode(
12668 ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
12669 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
12670 DstLaneV);
12671 }
12672
12673 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
12674 return NewSD;
12675
12676 // If the shuffle is not directly supported and it has 4 elements, use
12677 // the PerfectShuffle-generated table to synthesize it from other shuffles.
12678 unsigned NumElts = VT.getVectorNumElements();
12679 if (NumElts == 4) {
12680 unsigned PFIndexes[4];
12681 for (unsigned i = 0; i != 4; ++i) {
12682 if (ShuffleMask[i] < 0)
12683 PFIndexes[i] = 8;
12684 else
12685 PFIndexes[i] = ShuffleMask[i];
12686 }
12687
12688 // Compute the index in the perfect shuffle table.
12689 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
12690 PFIndexes[2] * 9 + PFIndexes[3];
12691 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
12692 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
12693 dl);
12694 }
12695
12696 return GenerateTBL(Op, ShuffleMask, DAG);
12697}
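// Sketch of the perfect-shuffle table index computed near the end of
// LowerVECTOR_SHUFFLE for 4-element masks: each mask element (undef mapped to
// 8) is one base-9 digit, giving 9*9*9*9 = 6561 table entries. Standalone
// arithmetic only; PerfectShuffleTable itself is defined elsewhere in the
// backend.
static unsigned perfectShuffleIndexSketch(const int Mask[4]) {
  unsigned PFIndexes[4];
  for (unsigned i = 0; i != 4; ++i)
    PFIndexes[i] = Mask[i] < 0 ? 8u : (unsigned)Mask[i];
  return PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
         PFIndexes[2] * 9 + PFIndexes[3];
}
// E.g. the mask <0, 3, 2, 1> maps to entry 0*729 + 3*81 + 2*9 + 1 = 262, and
// the PFEntry stored there tells GeneratePerfectShuffle which EXT/REV/ZIP/...
// sequence to emit.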
12698
12699SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
12700 SelectionDAG &DAG) const {
12701 EVT VT = Op.getValueType();
12702
12703 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
12704 return LowerToScalableOp(Op, DAG);
12705
12706 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
12707 "Unexpected vector type!");
12708
12709 // We can handle the constant cases during isel.
12710 if (isa<ConstantSDNode>(Op.getOperand(0)))
12711 return Op;
12712
12713 // There isn't a natural way to handle the general i1 case, so we use some
12714 // trickery with whilelo.
12715 SDLoc DL(Op);
12716 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
12717 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
12718 DAG.getValueType(MVT::i1));
12719 SDValue ID =
12720 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
12721 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
12722 if (VT == MVT::nxv1i1)
12723 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
12724 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
12725 Zero, SplatVal),
12726 Zero);
12727 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
12728}
12729
12730SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
12731 SelectionDAG &DAG) const {
12732 SDLoc DL(Op);
12733
12734 EVT VT = Op.getValueType();
12735 if (!isTypeLegal(VT) || !VT.isScalableVector())
12736 return SDValue();
12737
12738 // Current lowering only supports the SVE-ACLE types.
12739 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
12740 return SDValue();
12741
12742 // The DUPQ operation is independent of element type, so normalise to i64s.
12743 SDValue Idx128 = Op.getOperand(2);
12744
12745 // DUPQ can be used when idx is in range.
12746 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
12747 if (CIdx && (CIdx->getZExtValue() <= 3)) {
12748 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
12749 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
12750 }
12751
12752 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
12753
12754 // The ACLE says this must produce the same result as:
12755 // svtbl(data, svadd_x(svptrue_b64(),
12756 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
12757 // index * 2))
12758 SDValue One = DAG.getConstant(1, DL, MVT::i64);
12759 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
12760
12761 // create the vector 0,1,0,1,...
12762 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
12763 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
12764
12765 // create the vector idx64,idx64+1,idx64,idx64+1,...
12766 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
12767 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
12768 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
12769
12770 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
12771 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
12772 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
12773}
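// Scalar sketch of the index vector built above for the out-of-range DUPQ
// case: (step_vector & 1) + Idx128 * 2 yields the repeating pattern
// idx*2, idx*2+1, idx*2, idx*2+1, ..., so the TBL copies the two i64 halves of
// quadword Idx128 into every quadword of the result. The element count below
// is a stand-in for the runtime SVE vector length.
#include <cstdint>
#include <vector>

static std::vector<uint64_t> dupqTblIndicesSketch(uint64_t Idx128,
                                                  unsigned NumI64Elts) {
  std::vector<uint64_t> Indices(NumI64Elts);
  for (unsigned i = 0; i < NumI64Elts; ++i)
    Indices[i] = (i & 1) + Idx128 * 2;
  return Indices;
}
// For Idx128 == 3 on an implementation with four 128-bit quadwords
// (NumI64Elts == 8) this produces {6,7,6,7,6,7,6,7}, matching the ACLE formula
// quoted in the comment above.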
12774
12775
12776static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
12777 APInt &UndefBits) {
12778 EVT VT = BVN->getValueType(0);
12779 APInt SplatBits, SplatUndef;
12780 unsigned SplatBitSize;
12781 bool HasAnyUndefs;
12782 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
12783 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
12784
12785 for (unsigned i = 0; i < NumSplats; ++i) {
12786 CnstBits <<= SplatBitSize;
12787 UndefBits <<= SplatBitSize;
12788 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
12789 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
12790 }
12791
12792 return true;
12793 }
12794
12795 return false;
12796}
12797
12798// Try 64-bit splatted SIMD immediate.
12799static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12800 const APInt &Bits) {
12801 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12802 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12803 EVT VT = Op.getValueType();
12804 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
12805
12806 if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
12807 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
12808
12809 SDLoc dl(Op);
12810 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12811 DAG.getConstant(Value, dl, MVT::i32));
12812 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12813 }
12814 }
12815
12816 return SDValue();
12817}
12818
12819// Try 32-bit splatted SIMD immediate.
12820static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12821 const APInt &Bits,
12822 const SDValue *LHS = nullptr) {
12823 EVT VT = Op.getValueType();
12824 if (VT.isFixedLengthVector() &&
12825 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
12826 return SDValue();
12827
12828 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12829 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12830 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
12831 bool isAdvSIMDModImm = false;
12832 uint64_t Shift;
12833
12834 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
12836 Shift = 0;
12837 }
12838 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
12840 Shift = 8;
12841 }
12842 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
12844 Shift = 16;
12845 }
12846 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
12848 Shift = 24;
12849 }
12850
12851 if (isAdvSIMDModImm) {
12852 SDLoc dl(Op);
12853 SDValue Mov;
12854
12855 if (LHS)
12856 Mov = DAG.getNode(NewOp, dl, MovTy,
12857 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
12858 DAG.getConstant(Value, dl, MVT::i32),
12859 DAG.getConstant(Shift, dl, MVT::i32));
12860 else
12861 Mov = DAG.getNode(NewOp, dl, MovTy,
12862 DAG.getConstant(Value, dl, MVT::i32),
12863 DAG.getConstant(Shift, dl, MVT::i32));
12864
12865 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12866 }
12867 }
12868
12869 return SDValue();
12870}
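// Sketch of the shape of constant accepted by the 32-bit MOVI path above: the
// 128-bit value must repeat the same 64-bit half, that half must be a 32-bit
// splat, and the 32-bit word must have a single live byte whose position picks
// the LSL #0/#8/#16/#24 form. The real predicates are the AArch64_AM
// isAdvSIMDModImmType1..4 helpers; this plain C++ version is only illustrative.
#include <cstdint>
#include <optional>

struct Movi32ImmSketch { uint8_t Imm; unsigned Shift; };

static std::optional<Movi32ImmSketch> classifyMovi32Sketch(uint64_t Hi,
                                                           uint64_t Lo) {
  if (Hi != Lo)
    return std::nullopt;                    // not a splat of one 64-bit value
  uint32_t W = (uint32_t)Lo;
  if ((Lo >> 32) != W)
    return std::nullopt;                    // not a 32-bit splat
  for (unsigned Shift = 0; Shift < 32; Shift += 8)
    if ((W & ~(0xFFu << Shift)) == 0)       // exactly one live byte
      return Movi32ImmSketch{(uint8_t)(W >> Shift), Shift};
  return std::nullopt;
}
// E.g. a v4i32 splat of 0x00AB0000 classifies as Imm = 0xAB, Shift = 16
// ("movi v0.4s, #0xab, lsl #16"), while 0x00AB00CD has two live bytes and
// falls through to the other immediate forms tried by ConstantBuildVector.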
12871
12872// Try 16-bit splatted SIMD immediate.
12873static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12874 const APInt &Bits,
12875 const SDValue *LHS = nullptr) {
12876 EVT VT = Op.getValueType();
12877 if (VT.isFixedLengthVector() &&
12878 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
12879 return SDValue();
12880
12881 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12882 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12883 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
12884 bool isAdvSIMDModImm = false;
12885 uint64_t Shift;
12886
12887 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
12889 Shift = 0;
12890 }
12891 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
12893 Shift = 8;
12894 }
12895
12896 if (isAdvSIMDModImm) {
12897 SDLoc dl(Op);
12898 SDValue Mov;
12899
12900 if (LHS)
12901 Mov = DAG.getNode(NewOp, dl, MovTy,
12902 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
12903 DAG.getConstant(Value, dl, MVT::i32),
12904 DAG.getConstant(Shift, dl, MVT::i32));
12905 else
12906 Mov = DAG.getNode(NewOp, dl, MovTy,
12907 DAG.getConstant(Value, dl, MVT::i32),
12908 DAG.getConstant(Shift, dl, MVT::i32));
12909
12910 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12911 }
12912 }
12913
12914 return SDValue();
12915}
12916
12917// Try 32-bit splatted SIMD immediate with shifted ones.
12918static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
12919 SelectionDAG &DAG, const APInt &Bits) {
12920 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12921 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12922 EVT VT = Op.getValueType();
12923 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
12924 bool isAdvSIMDModImm = false;
12925 uint64_t Shift;
12926
12927 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
12929 Shift = 264;
12930 }
12931 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
12933 Shift = 272;
12934 }
12935
12936 if (isAdvSIMDModImm) {
12937 SDLoc dl(Op);
12938 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12939 DAG.getConstant(Value, dl, MVT::i32),
12940 DAG.getConstant(Shift, dl, MVT::i32));
12941 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12942 }
12943 }
12944
12945 return SDValue();
12946}
12947
12948// Try 8-bit splatted SIMD immediate.
12949static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12950 const APInt &Bits) {
12951 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12952 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12953 EVT VT = Op.getValueType();
12954 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
12955
12956 if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
12957 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
12958
12959 SDLoc dl(Op);
12960 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12961 DAG.getConstant(Value, dl, MVT::i32));
12962 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12963 }
12964 }
12965
12966 return SDValue();
12967}
12968
12969// Try FP splatted SIMD immediate.
12970static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12971 const APInt &Bits) {
12972 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12973 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12974 EVT VT = Op.getValueType();
12975 bool isWide = (VT.getSizeInBits() == 128);
12976 MVT MovTy;
12977 bool isAdvSIMDModImm = false;
12978
12979 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
12981 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
12982 }
12983 else if (isWide &&
12984 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
12986 MovTy = MVT::v2f64;
12987 }
12988
12989 if (isAdvSIMDModImm) {
12990 SDLoc dl(Op);
12991 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12992 DAG.getConstant(Value, dl, MVT::i32));
12993 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12994 }
12995 }
12996
12997 return SDValue();
12998}
12999
13000// Specialized code to quickly find if PotentialBVec is a BuildVector that
13001// consists of only the same constant int value, returned in reference arg
13002// ConstVal
13003static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
13004 uint64_t &ConstVal) {
13005 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
13006 if (!Bvec)
13007 return false;
13008 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
13009 if (!FirstElt)
13010 return false;
13011 EVT VT = Bvec->getValueType(0);
13012 unsigned NumElts = VT.getVectorNumElements();
13013 for (unsigned i = 1; i < NumElts; ++i)
13014 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
13015 return false;
13016 ConstVal = FirstElt->getZExtValue();
13017 return true;
13018}
13019
13021 // Look through cast.
13022 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
13023 N = N.getOperand(0);
13024
13025 return ISD::isConstantSplatVectorAllZeros(N.getNode());
13026}
13027
13028static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
13029 unsigned NumElts = N.getValueType().getVectorMinNumElements();
13030
13031 // Look through cast.
13032 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
13033 N = N.getOperand(0);
13034 // When reinterpreting from a type with fewer elements the "new" elements
13035 // are not active, so bail if they're likely to be used.
13036 if (N.getValueType().getVectorMinNumElements() < NumElts)
13037 return false;
13038 }
13039
13040 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
13041 return true;
13042
13043 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
13044 // or smaller than the implicit element type represented by N.
13045 // NOTE: A larger element count implies a smaller element type.
13046 if (N.getOpcode() == AArch64ISD::PTRUE &&
13047 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
13048 return N.getValueType().getVectorMinNumElements() >= NumElts;
13049
13050 // If we're compiling for a specific vector-length, we can check if the
13051 // pattern's VL equals that of the scalable vector at runtime.
13052 if (N.getOpcode() == AArch64ISD::PTRUE) {
13053 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
13054 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
13055 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
13056 if (MaxSVESize && MinSVESize == MaxSVESize) {
13057 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
13058 unsigned PatNumElts =
13059 getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
13060 return PatNumElts == (NumElts * VScale);
13061 }
13062 }
13063
13064 return false;
13065}
13066
13067// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
13068// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
13069// BUILD_VECTORs with constant element C1, C2 is a constant, and:
13070// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
13071// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
13072// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
13073static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
13074 EVT VT = N->getValueType(0);
13075
13076 if (!VT.isVector())
13077 return SDValue();
13078
13079 SDLoc DL(N);
13080
13081 SDValue And;
13082 SDValue Shift;
13083
13084 SDValue FirstOp = N->getOperand(0);
13085 unsigned FirstOpc = FirstOp.getOpcode();
13086 SDValue SecondOp = N->getOperand(1);
13087 unsigned SecondOpc = SecondOp.getOpcode();
13088
13089 // Is one of the operands an AND or a BICi? The AND may have been optimised to
13090 // a BICi in order to use an immediate instead of a register.
13091 // Is the other operand a shl or lshr? This will have been turned into:
13092 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
13093 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
13094 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
13095 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
13096 SecondOpc == AArch64ISD::SHL_PRED ||
13097 SecondOpc == AArch64ISD::SRL_PRED)) {
13098 And = FirstOp;
13099 Shift = SecondOp;
13100
13101 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
13102 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
13103 FirstOpc == AArch64ISD::SHL_PRED ||
13104 FirstOpc == AArch64ISD::SRL_PRED)) {
13105 And = SecondOp;
13106 Shift = FirstOp;
13107 } else
13108 return SDValue();
13109
13110 bool IsAnd = And.getOpcode() == ISD::AND;
13111 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
13112 Shift.getOpcode() == AArch64ISD::SRL_PRED;
13113 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
13114 Shift.getOpcode() == AArch64ISD::SRL_PRED;
13115
13116 // Is the shift amount constant and are all lanes active?
13117 uint64_t C2;
13118 if (ShiftHasPredOp) {
13119 if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
13120 return SDValue();
13121 APInt C;
13122 if (!ISD::isConstantSplatVector(Shift.getOperand(2).getNode(), C))
13123 return SDValue();
13124 C2 = C.getZExtValue();
13125 } else if (ConstantSDNode *C2node =
13126 dyn_cast<ConstantSDNode>(Shift.getOperand(1)))
13127 C2 = C2node->getZExtValue();
13128 else
13129 return SDValue();
13130
13131 APInt C1AsAPInt;
13132 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
13133 if (IsAnd) {
13134 // Is the and mask vector all constant?
13135 if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
13136 return SDValue();
13137 } else {
13138 // Reconstruct the corresponding AND immediate from the two BICi immediates.
13139 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
13140 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
13141 assert(C1nodeImm && C1nodeShift);
13142 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
13143 C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
13144 }
13145
13146 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
13147 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
13148 // how much one can shift elements of a particular size?
13149 if (C2 > ElemSizeInBits)
13150 return SDValue();
13151
13152 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
13153 : APInt::getLowBitsSet(ElemSizeInBits, C2);
13154 if (C1AsAPInt != RequiredC1)
13155 return SDValue();
13156
13157 SDValue X = And.getOperand(0);
13158 SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
13159 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
13160 : Shift.getOperand(1);
13161
13162 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
13163 SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Imm);
13164
13165 LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
13166 LLVM_DEBUG(N->dump(&DAG));
13167 LLVM_DEBUG(dbgs() << "into: \n");
13168 LLVM_DEBUG(ResultSLI->dump(&DAG));
13169
13170 ++NumShiftInserts;
13171 return ResultSLI;
13172}
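// Per-element sketch of the constant check above: the AND mask has to preserve
// exactly the bits that the shifted operand cannot write, i.e. the low C2 bits
// for a left shift (SLI) and the high C2 bits for a right shift (SRI). A
// 64-bit scalar model of the APInt comparison, for illustration only.
#include <cstdint>

static bool sliSriMaskMatchesSketch(uint64_t C1, unsigned C2,
                                    unsigned ElemSizeInBits, bool IsShiftRight) {
  if (C2 > ElemSizeInBits || ElemSizeInBits > 64)
    return false;
  uint64_t ElemMask =
      ElemSizeInBits == 64 ? ~0ull : ((1ull << ElemSizeInBits) - 1);
  uint64_t LowC2 = C2 == 0 ? 0 : (C2 == 64 ? ~0ull : ((1ull << C2) - 1));
  uint64_t HighC2 = C2 == 0 ? 0 : ((LowC2 << (ElemSizeInBits - C2)) & ElemMask);
  return (C1 & ElemMask) == (IsShiftRight ? HighC2 : LowC2);
}
// E.g. with 16-bit elements and C2 == 4, (or (and X, 0x000F), (shl Y, 4)) passes
// the check and becomes "sli v0.8h, v1.8h, #4", whereas an AND mask of 0x00FF
// fails because bits 4..7 of X would be clobbered by the shifted-in value.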
13173
13174SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
13175 SelectionDAG &DAG) const {
13176 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13177 !Subtarget->isNeonAvailable()))
13178 return LowerToScalableOp(Op, DAG);
13179
13180 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
13181 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
13182 return Res;
13183
13184 EVT VT = Op.getValueType();
13185 if (VT.isScalableVector())
13186 return Op;
13187
13188 SDValue LHS = Op.getOperand(0);
13189 BuildVectorSDNode *BVN =
13190 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
13191 if (!BVN) {
13192 // OR commutes, so try swapping the operands.
13193 LHS = Op.getOperand(1);
13194 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
13195 }
13196 if (!BVN)
13197 return Op;
13198
13199 APInt DefBits(VT.getSizeInBits(), 0);
13200 APInt UndefBits(VT.getSizeInBits(), 0);
13201 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
13202 SDValue NewOp;
13203
13204 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
13205 DefBits, &LHS)) ||
13206 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
13207 DefBits, &LHS)))
13208 return NewOp;
13209
13210 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
13211 UndefBits, &LHS)) ||
13212 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
13213 UndefBits, &LHS)))
13214 return NewOp;
13215 }
13216
13217 // We can always fall back to a non-immediate OR.
13218 return Op;
13219}
13220
13221// Normalize the operands of BUILD_VECTOR. The value of constant operands will
13222// be truncated to fit element width.
13223static SDValue NormalizeBuildVector(SDValue Op,
13224 SelectionDAG &DAG) {
13225 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13226 SDLoc dl(Op);
13227 EVT VT = Op.getValueType();
13228 EVT EltTy= VT.getVectorElementType();
13229
13230 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
13231 return Op;
13232
13233 SmallVector<SDValue, 16> Ops;
13234 for (SDValue Lane : Op->ops()) {
13235 // For integer vectors, type legalization would have promoted the
13236 // operands already. Otherwise, if Op is a floating-point splat
13237 // (with operands cast to integers), then the only possibilities
13238 // are constants and UNDEFs.
13239 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
13240 APInt LowBits(EltTy.getSizeInBits(),
13241 CstLane->getZExtValue());
13242 Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
13243 } else if (Lane.getNode()->isUndef()) {
13244 Lane = DAG.getUNDEF(MVT::i32);
13245 } else {
13246 assert(Lane.getValueType() == MVT::i32 &&
13247 "Unexpected BUILD_VECTOR operand type");
13248 }
13249 Ops.push_back(Lane);
13250 }
13251 return DAG.getBuildVector(VT, dl, Ops);
13252}
13253
13254static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
13255 const AArch64Subtarget *ST) {
13256 EVT VT = Op.getValueType();
13257 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
13258 "Expected a legal NEON vector");
13259
13260 APInt DefBits(VT.getSizeInBits(), 0);
13261 APInt UndefBits(VT.getSizeInBits(), 0);
13262 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
13263 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
13264 auto TryMOVIWithBits = [&](APInt DefBits) {
13265 SDValue NewOp;
13266 if ((NewOp =
13267 tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
13268 (NewOp =
13269 tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
13270 (NewOp =
13271 tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
13272 (NewOp =
13273 tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
13274 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
13275 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
13276 return NewOp;
13277
13278 APInt NotDefBits = ~DefBits;
13279 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
13280 NotDefBits)) ||
13281 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
13282 NotDefBits)) ||
13283 (NewOp =
13284 tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
13285 return NewOp;
13286 return SDValue();
13287 };
13288 if (SDValue R = TryMOVIWithBits(DefBits))
13289 return R;
13290 if (SDValue R = TryMOVIWithBits(UndefBits))
13291 return R;
13292
13293 // See if a fneg of the constant can be materialized with a MOVI, etc
13294 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
13295 // FNegate each sub-element of the constant
13296 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
13297 APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
13298 .zext(VT.getSizeInBits());
13299 APInt NegBits(VT.getSizeInBits(), 0);
13300 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
13301 for (unsigned i = 0; i < NumElts; i++)
13302 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
13303 NegBits = DefBits ^ NegBits;
13304
13305 // Try to create the new constants with MOVI, and if so generate a fneg
13306 // for it.
13307 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
13308 SDLoc DL(Op);
13309 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
13310 return DAG.getNode(
13311 AArch64ISD::NVCAST, DL, VT,
13312 DAG.getNode(ISD::FNEG, DL, VFVT,
13313 DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
13314 }
13315 return SDValue();
13316 };
13317 SDValue R;
13318 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
13319 (R = TryWithFNeg(DefBits, MVT::f64)) ||
13320 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
13321 return R;
13322 }
13323
13324 return SDValue();
13325}
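// Bit-level sketch of the FNEG trick used by TryWithFNeg above: XOR the sign
// bit of every floating-point element into the wanted constant, try the MOVI
// forms on the result, and emit an FNEG afterwards to put the sign bits back.
// The model below handles the f32 case over a 128-bit constant split into two
// 64-bit halves; it is standalone and only illustrative.
#include <cassert>
#include <cstdint>

static void flipF32SignBitsSketch(const uint64_t In[2], uint64_t Out[2]) {
  const uint64_t SignBits = 0x8000000080000000ull; // one sign bit per f32 lane
  Out[0] = In[0] ^ SignBits;
  Out[1] = In[1] ^ SignBits;
}

int main() {
  uint64_t In[2] = {0x3F800000BF800000ull, 0x0000000080000000ull};
  uint64_t Neg[2], Back[2];
  flipF32SignBitsSketch(In, Neg);
  flipF32SignBitsSketch(Neg, Back);
  // Applying the mask twice is a no-op, which is what makes "fneg (movi ...)"
  // a faithful way to rebuild the original bits.
  assert(Back[0] == In[0] && Back[1] == In[1]);
  return 0;
}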
13326
13327SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
13328 SelectionDAG &DAG) const {
13329 EVT VT = Op.getValueType();
13330
13331 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
13332 if (auto SeqInfo = cast<BuildVectorSDNode>(Op)->isConstantSequence()) {
13333 SDLoc DL(Op);
13334 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
13335 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
13336 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
13337 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
13338 return convertFromScalableVector(DAG, Op.getValueType(), Seq);
13339 }
13340
13341 // Revert to common legalisation for all other variants.
13342 return SDValue();
13343 }
13344
13345 // Try to build a simple constant vector.
13346 Op = NormalizeBuildVector(Op, DAG);
13347 // Though this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so,
13348 // abort.
13349 if (Op.getOpcode() != ISD::BUILD_VECTOR)
13350 return SDValue();
13351
13352 // Certain vector constants, used to express things like logical NOT and
13353 // arithmetic NEG, are passed through unmodified. This allows special
13354 // patterns for these operations to match, which will lower these constants
13355 // to whatever is proven necessary.
13356 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
13357 if (BVN->isConstant()) {
13358 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
13359 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
13360 APInt Val(BitSize,
13361 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
13362 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
13363 return Op;
13364 }
13365 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
13366 if (Const->isZero() && !Const->isNegative())
13367 return Op;
13368 }
13369
13370 if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
13371 return V;
13372
13373 // Scan through the operands to find some interesting properties we can
13374 // exploit:
13375 // 1) If only one value is used, we can use a DUP, or
13376 // 2) if only the low element is not undef, we can just insert that, or
13377 // 3) if only one constant value is used (w/ some non-constant lanes),
13378 // we can splat the constant value into the whole vector then fill
13379 // in the non-constant lanes.
13380 // 4) FIXME: If different constant values are used, but we can intelligently
13381 // select the values we'll be overwriting for the non-constant
13382 // lanes such that we can directly materialize the vector
13383 // some other way (MOVI, e.g.), we can be sneaky.
13384 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
13385 SDLoc dl(Op);
13386 unsigned NumElts = VT.getVectorNumElements();
13387 bool isOnlyLowElement = true;
13388 bool usesOnlyOneValue = true;
13389 bool usesOnlyOneConstantValue = true;
13390 bool isConstant = true;
13391 bool AllLanesExtractElt = true;
13392 unsigned NumConstantLanes = 0;
13393 unsigned NumDifferentLanes = 0;
13394 unsigned NumUndefLanes = 0;
13395 SDValue Value;
13396 SDValue ConstantValue;
13397 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
13398 unsigned ConsecutiveValCount = 0;
13399 SDValue PrevVal;
13400 for (unsigned i = 0; i < NumElts; ++i) {
13401 SDValue V = Op.getOperand(i);
13402 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13403 AllLanesExtractElt = false;
13404 if (V.isUndef()) {
13405 ++NumUndefLanes;
13406 continue;
13407 }
13408 if (i > 0)
13409 isOnlyLowElement = false;
13410 if (!isIntOrFPConstant(V))
13411 isConstant = false;
13412
13413 if (isIntOrFPConstant(V)) {
13414 ++NumConstantLanes;
13415 if (!ConstantValue.getNode())
13416 ConstantValue = V;
13417 else if (ConstantValue != V)
13418 usesOnlyOneConstantValue = false;
13419 }
13420
13421 if (!Value.getNode())
13422 Value = V;
13423 else if (V != Value) {
13424 usesOnlyOneValue = false;
13425 ++NumDifferentLanes;
13426 }
13427
13428 if (PrevVal != V) {
13429 ConsecutiveValCount = 0;
13430 PrevVal = V;
13431 }
13432
13433 // Keep the different values and their last consecutive counts. For example,
13434 //
13435 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
13436 // t24, t24, t24, t24, t24, t24, t24, t24
13437 // t23 = consecutive count 8
13438 // t24 = consecutive count 8
13439 // ------------------------------------------------------------------
13440 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
13441 // t24, t24, t24, t24, t24, t24, t24, t24
13442 // t23 = consecutive count 5
13443 // t24 = consecutive count 9
13444 DifferentValueMap[V] = ++ConsecutiveValCount;
13445 }
13446
13447 if (!Value.getNode()) {
13448 LLVM_DEBUG(
13449 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
13450 return DAG.getUNDEF(VT);
13451 }
13452
13453 // Convert BUILD_VECTOR where all elements but the lowest are undef into
13454 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
13455 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
13456 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
13457 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
13458 "SCALAR_TO_VECTOR node\n");
13459 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
13460 }
13461
13462 if (AllLanesExtractElt) {
13463 SDNode *Vector = nullptr;
13464 bool Even = false;
13465 bool Odd = false;
13466 // Check whether the extract elements match the Even pattern <0,2,4,...> or
13467 // the Odd pattern <1,3,5,...>.
13468 for (unsigned i = 0; i < NumElts; ++i) {
13469 SDValue V = Op.getOperand(i);
13470 const SDNode *N = V.getNode();
13471 if (!isa<ConstantSDNode>(N->getOperand(1))) {
13472 Even = false;
13473 Odd = false;
13474 break;
13475 }
13476 SDValue N0 = N->getOperand(0);
13477
13478 // All elements are extracted from the same vector.
13479 if (!Vector) {
13480 Vector = N0.getNode();
13481 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
13482 // BUILD_VECTOR.
13483 if (VT.getVectorElementType() !=
13484 N0.getValueType().getVectorElementType())
13485 break;
13486 } else if (Vector != N0.getNode()) {
13487 Odd = false;
13488 Even = false;
13489 break;
13490 }
13491
13492 // Extracted values are either at Even indices <0,2,4,...> or at Odd
13493 // indices <1,3,5,...>.
13494 uint64_t Val = N->getConstantOperandVal(1);
13495 if (Val == 2 * i) {
13496 Even = true;
13497 continue;
13498 }
13499 if (Val - 1 == 2 * i) {
13500 Odd = true;
13501 continue;
13502 }
13503
13504 // Something does not match: abort.
13505 Odd = false;
13506 Even = false;
13507 break;
13508 }
13509 if (Even || Odd) {
13510 SDValue LHS =
13511 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
13512 DAG.getConstant(0, dl, MVT::i64));
13513 SDValue RHS =
13514 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
13515 DAG.getConstant(NumElts, dl, MVT::i64));
13516
13517 if (Even && !Odd)
13518 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
13519 RHS);
13520 if (Odd && !Even)
13521 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
13522 RHS);
13523 }
13524 }
13525
13526 // Use DUP for non-constant splats. For f32 constant splats, reduce to
13527 // i32 and try again.
13528 if (usesOnlyOneValue) {
13529 if (!isConstant) {
13530 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13531 Value.getValueType() != VT) {
13532 LLVM_DEBUG(
13533 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
13534 return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
13535 }
13536
13537 // This is actually a DUPLANExx operation, which keeps everything vectory.
13538
13539 SDValue Lane = Value.getOperand(1);
13540 Value = Value.getOperand(0);
13541 if (Value.getValueSizeInBits() == 64) {
13542 LLVM_DEBUG(
13543 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
13544 "widening it\n");
13545 Value = WidenVector(Value, DAG);
13546 }
13547
13548 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
13549 return DAG.getNode(Opcode, dl, VT, Value, Lane);
13550 }
13551
13552 if (VT.isFloatingPoint()) {
13553 SmallVector<SDValue, 8> Ops;
13554 EVT EltTy = VT.getVectorElementType();
13555 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
13556 EltTy == MVT::f64) && "Unsupported floating-point vector type");
13557 LLVM_DEBUG(
13558 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
13559 "BITCASTS, and try again\n");
13560 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
13561 for (unsigned i = 0; i < NumElts; ++i)
13562 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
13563 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
13564 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
13565 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
13566 Val.dump(););
13567 Val = LowerBUILD_VECTOR(Val, DAG);
13568 if (Val.getNode())
13569 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
13570 }
13571 }
13572
13573 // If we need to insert a small number of different non-constant elements and
13574 // the vector width is sufficiently large, prefer using DUP with the common
13575 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
13576 // skip the constant lane handling below.
13577 bool PreferDUPAndInsert =
13578 !isConstant && NumDifferentLanes >= 1 &&
13579 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
13580 NumDifferentLanes >= NumConstantLanes;
13581
13582 // If there was only one constant value used and for more than one lane,
13583 // start by splatting that value, then replace the non-constant lanes. This
13584 // is better than the default, which will perform a separate initialization
13585 // for each lane.
13586 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
13587 // Firstly, try to materialize the splat constant.
13588 SDValue Val = DAG.getSplatBuildVector(VT, dl, ConstantValue);
13589 unsigned BitSize = VT.getScalarSizeInBits();
13590 APInt ConstantValueAPInt(1, 0);
13591 if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
13592 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
13593 if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
13594 !ConstantValueAPInt.isAllOnes()) {
13595 Val = ConstantBuildVector(Val, DAG, Subtarget);
13596 if (!Val)
13597 // Otherwise, materialize the constant and splat it.
13598 Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
13599 }
13600
13601 // Now insert the non-constant lanes.
13602 for (unsigned i = 0; i < NumElts; ++i) {
13603 SDValue V = Op.getOperand(i);
13604 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
13605 if (!isIntOrFPConstant(V))
13606 // Note that type legalization likely mucked about with the VT of the
13607 // source operand, so we may have to convert it here before inserting.
13608 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
13609 }
13610 return Val;
13611 }
13612
13613 // This will generate a load from the constant pool.
13614 if (isConstant) {
13615 LLVM_DEBUG(
13616 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
13617 "expansion\n");
13618 return SDValue();
13619 }
13620
13621 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
13622 // v4i32s. This is really a truncate, which we can construct out of (legal)
13623 // concats and truncate nodes.
13624  if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
13625    return M;
13626
13627 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
13628 if (NumElts >= 4) {
13629 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
13630 return Shuffle;
13631
13632 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
13633 return Shuffle;
13634 }
13635
13636 if (PreferDUPAndInsert) {
13637 // First, build a constant vector with the common element.
13638 SmallVector<SDValue, 8> Ops(NumElts, Value);
13639 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
13640 // Next, insert the elements that do not match the common value.
13641 for (unsigned I = 0; I < NumElts; ++I)
13642 if (Op.getOperand(I) != Value)
13643 NewVector =
13644 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
13645 Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
13646
13647 return NewVector;
13648 }
13649
13650 // If vector consists of two different values, try to generate two DUPs and
13651 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
13652 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
13653    SmallVector<SDValue, 2> Vals;
13654    // Check that each of the two values fills exactly half of the vector lanes
13655    // consecutively. In that case we can use CONCAT_VECTORS. For example,
13656 //
13657 // canUseVECTOR_CONCAT = true;
13658 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
13659 // t24, t24, t24, t24, t24, t24, t24, t24
13660 //
13661 // canUseVECTOR_CONCAT = false;
13662 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
13663 // t24, t24, t24, t24, t24, t24, t24, t24
13664 bool canUseVECTOR_CONCAT = true;
13665 for (auto Pair : DifferentValueMap) {
13666      // Check that each distinct value covers exactly NumElts / 2 lanes.
13667 if (Pair.second != NumElts / 2)
13668 canUseVECTOR_CONCAT = false;
13669 Vals.push_back(Pair.first);
13670 }
13671
13672 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
13673 // CONCAT_VECTORs. For example,
13674 //
13675 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
13676 // t24, t24, t24, t24, t24, t24, t24, t24
13677 // ==>
13678 // t26: v8i8 = AArch64ISD::DUP t23
13679 // t28: v8i8 = AArch64ISD::DUP t24
13680 // t29: v16i8 = concat_vectors t26, t28
13681 if (canUseVECTOR_CONCAT) {
13682 EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
13683 if (isTypeLegal(SubVT) && SubVT.isVector() &&
13684 SubVT.getVectorNumElements() >= 2) {
13685 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
13686 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
13687 SDValue DUP1 =
13688 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops1), DAG);
13689 SDValue DUP2 =
13690 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops2), DAG);
13691        SDValue CONCAT_VECTORS =
13692            DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DUP1, DUP2);
13693 return CONCAT_VECTORS;
13694 }
13695 }
13696
13697 // Let's try to generate VECTOR_SHUFFLE. For example,
13698 //
13699 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
13700 // ==>
13701 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
13702 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
13703 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
13704 if (NumElts >= 8) {
13705 SmallVector<int, 16> MaskVec;
13706      // Build the mask for the VECTOR_SHUFFLE.
13707 SDValue FirstLaneVal = Op.getOperand(0);
13708 for (unsigned i = 0; i < NumElts; ++i) {
13709 SDValue Val = Op.getOperand(i);
13710 if (FirstLaneVal == Val)
13711 MaskVec.push_back(i);
13712 else
13713 MaskVec.push_back(i + NumElts);
13714 }
13715
13716 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
13717 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
13718 SDValue VEC1 = DAG.getBuildVector(VT, dl, Ops1);
13719 SDValue VEC2 = DAG.getBuildVector(VT, dl, Ops2);
13720      SDValue VECTOR_SHUFFLE =
13721          DAG.getVectorShuffle(VT, dl, VEC1, VEC2, MaskVec);
13722 return VECTOR_SHUFFLE;
13723 }
13724 }
13725
13726 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
13727 // know the default expansion would otherwise fall back on something even
13728 // worse. For a vector with one or two non-undef values, that's
13729 // scalar_to_vector for the elements followed by a shuffle (provided the
13730 // shuffle is valid for the target) and materialization element by element
13731 // on the stack followed by a load for everything else.
13732 if (!isConstant && !usesOnlyOneValue) {
13733 LLVM_DEBUG(
13734 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
13735 "of INSERT_VECTOR_ELT\n");
13736
13737 SDValue Vec = DAG.getUNDEF(VT);
13738 SDValue Op0 = Op.getOperand(0);
13739 unsigned i = 0;
13740
13741 // Use SCALAR_TO_VECTOR for lane zero to
13742 // a) Avoid a RMW dependency on the full vector register, and
13743 // b) Allow the register coalescer to fold away the copy if the
13744 // value is already in an S or D register, and we're forced to emit an
13745 // INSERT_SUBREG that we can't fold anywhere.
13746 //
13747 // We also allow types like i8 and i16 which are illegal scalar but legal
13748 // vector element types. After type-legalization the inserted value is
13749 // extended (i32) and it is safe to cast them to the vector type by ignoring
13750 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
13751 if (!Op0.isUndef()) {
13752 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
13753 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
13754 ++i;
13755 }
13756 LLVM_DEBUG(if (i < NumElts) dbgs()
13757 << "Creating nodes for the other vector elements:\n";);
13758 for (; i < NumElts; ++i) {
13759 SDValue V = Op.getOperand(i);
13760 if (V.isUndef())
13761 continue;
13762 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
13763 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
13764 }
13765 return Vec;
13766 }
13767
13768 LLVM_DEBUG(
13769 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
13770 "better alternative\n");
13771 return SDValue();
13772}
13773
13774SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
13775 SelectionDAG &DAG) const {
13776 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13777 !Subtarget->isNeonAvailable()))
13778 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
13779
13780 assert(Op.getValueType().isScalableVector() &&
13781 isTypeLegal(Op.getValueType()) &&
13782 "Expected legal scalable vector type!");
13783
13784 if (isTypeLegal(Op.getOperand(0).getValueType())) {
13785 unsigned NumOperands = Op->getNumOperands();
13786 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
13787 "Unexpected number of operands in CONCAT_VECTORS");
13788
13789 if (NumOperands == 2)
13790 return Op;
13791
13792 // Concat each pair of subvectors and pack into the lower half of the array.
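    // E.g. with four operands the first pass forms (Op0:Op1) and (Op2:Op3),
    // and the second pass concatenates those into the full result.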
13793 SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());
13794 while (ConcatOps.size() > 1) {
13795 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
13796 SDValue V1 = ConcatOps[I];
13797 SDValue V2 = ConcatOps[I + 1];
13798 EVT SubVT = V1.getValueType();
13799 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
13800 ConcatOps[I / 2] =
13801 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
13802 }
13803 ConcatOps.resize(ConcatOps.size() / 2);
13804 }
13805 return ConcatOps[0];
13806 }
13807
13808 return SDValue();
13809}
13810
13811SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
13812 SelectionDAG &DAG) const {
13813 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
13814
13815 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13816 !Subtarget->isNeonAvailable()))
13817 return LowerFixedLengthInsertVectorElt(Op, DAG);
13818
13819 EVT VT = Op.getOperand(0).getValueType();
13820
13821 if (VT.getScalarType() == MVT::i1) {
13822 EVT VectorVT = getPromotedVTForPredicate(VT);
13823 SDLoc DL(Op);
13824 SDValue ExtendedVector =
13825 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
13826 SDValue ExtendedValue =
13827 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
13828 VectorVT.getScalarType().getSizeInBits() < 32
13829 ? MVT::i32
13830 : VectorVT.getScalarType());
13831 ExtendedVector =
13832 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
13833 ExtendedValue, Op.getOperand(2));
13834 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
13835 }
13836
13837 // Check for non-constant or out of range lane.
13838 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
13839 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
13840 return SDValue();
13841
13842 return Op;
13843}
13844
13845SDValue
13846AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
13847 SelectionDAG &DAG) const {
13848 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
13849 EVT VT = Op.getOperand(0).getValueType();
13850
13851 if (VT.getScalarType() == MVT::i1) {
13852 // We can't directly extract from an SVE predicate; extend it first.
13853 // (This isn't the only possible lowering, but it's straightforward.)
13854 EVT VectorVT = getPromotedVTForPredicate(VT);
13855 SDLoc DL(Op);
13856 SDValue Extend =
13857 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
13858 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
13859 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
13860 Extend, Op.getOperand(1));
13861 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
13862 }
13863
13864 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
13865 return LowerFixedLengthExtractVectorElt(Op, DAG);
13866
13867 // Check for non-constant or out of range lane.
13868 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
13869 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
13870 return SDValue();
13871
13872 // Insertion/extraction are legal for V128 types.
13873 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
13874 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
13875 VT == MVT::v8f16 || VT == MVT::v8bf16)
13876 return Op;
13877
13878 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
13879 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
13880 VT != MVT::v4bf16)
13881 return SDValue();
13882
13883 // For V64 types, we perform extraction by expanding the value
13884 // to a V128 type and perform the extraction on that.
13885 SDLoc DL(Op);
13886 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
13887 EVT WideTy = WideVec.getValueType();
13888
13889 EVT ExtrTy = WideTy.getVectorElementType();
13890 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
13891 ExtrTy = MVT::i32;
13892
13893 // For extractions, we just return the result directly.
13894 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
13895 Op.getOperand(1));
13896}
13897
13898SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
13899 SelectionDAG &DAG) const {
13900 assert(Op.getValueType().isFixedLengthVector() &&
13901 "Only cases that extract a fixed length vector are supported!");
13902
13903 EVT InVT = Op.getOperand(0).getValueType();
13904 unsigned Idx = Op.getConstantOperandVal(1);
13905 unsigned Size = Op.getValueSizeInBits();
13906
13907 // If we don't have legal types yet, do nothing
13908 if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT))
13909 return SDValue();
13910
13911 if (InVT.isScalableVector()) {
13912 // This will be matched by custom code during ISelDAGToDAG.
13913 if (Idx == 0 && isPackedVectorType(InVT, DAG))
13914 return Op;
13915
13916 return SDValue();
13917 }
13918
13919 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
13920 if (Idx == 0 && InVT.getSizeInBits() <= 128)
13921 return Op;
13922
13923 // If this is extracting the upper 64-bits of a 128-bit vector, we match
13924 // that directly.
13925 if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 &&
13926 InVT.getSizeInBits() == 128 && Subtarget->isNeonAvailable())
13927 return Op;
13928
13929 if (useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
13930 SDLoc DL(Op);
13931
13932 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
13933 SDValue NewInVec =
13934 convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
13935
13936 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ContainerVT, NewInVec,
13937 NewInVec, DAG.getConstant(Idx, DL, MVT::i64));
13938 return convertFromScalableVector(DAG, Op.getValueType(), Splice);
13939 }
13940
13941 return SDValue();
13942}
13943
13944SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
13945 SelectionDAG &DAG) const {
13946 assert(Op.getValueType().isScalableVector() &&
13947 "Only expect to lower inserts into scalable vectors!");
13948
13949 EVT InVT = Op.getOperand(1).getValueType();
13950 unsigned Idx = Op.getConstantOperandVal(2);
13951
13952 SDValue Vec0 = Op.getOperand(0);
13953 SDValue Vec1 = Op.getOperand(1);
13954 SDLoc DL(Op);
13955 EVT VT = Op.getValueType();
13956
13957 if (InVT.isScalableVector()) {
13958 if (!isTypeLegal(VT))
13959 return SDValue();
13960
13961 // Break down insert_subvector into simpler parts.
13962 if (VT.getVectorElementType() == MVT::i1) {
13963 unsigned NumElts = VT.getVectorMinNumElements();
13964 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
13965
13966 SDValue Lo, Hi;
13967 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
13968 DAG.getVectorIdxConstant(0, DL));
13969 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
13970 DAG.getVectorIdxConstant(NumElts / 2, DL));
13971 if (Idx < (NumElts / 2))
13972 Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
13973                       DAG.getVectorIdxConstant(Idx, DL));
13974      else
13975 Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
13976 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
13977
13978 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
13979 }
13980
13981 // Ensure the subvector is half the size of the main vector.
13982 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
13983 return SDValue();
13984
13985    // Here "narrow" and "wide" refer to the vector element types. After
13986    // "casting", both vectors must have the same bit length, and because the
13987    // subvector has fewer elements, those elements need to be bigger.
13988    EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
13989    EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
13990
13991 // NOP cast operands to the largest legal vector of the same element count.
13992 if (VT.isFloatingPoint()) {
13993 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
13994 Vec1 = getSVESafeBitCast(WideVT, Vec1, DAG);
13995 } else {
13996 // Legal integer vectors are already their largest so Vec0 is fine as is.
13997 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
13998 }
13999
14000 // To replace the top/bottom half of vector V with vector SubV we widen the
14001 // preserved half of V, concatenate this to SubV (the order depending on the
14002 // half being replaced) and then narrow the result.
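    // E.g. when replacing the low half (Idx == 0), the preserved high half of
    // Vec0 is widened with UUNPKHI and then interleaved back with the new
    // subvector using UZP1.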
14003 SDValue Narrow;
14004 if (Idx == 0) {
14005 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
14006 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
14007 } else {
14008      assert(Idx == InVT.getVectorMinNumElements() &&
14009             "Invalid subvector index!");
14010 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
14011 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
14012 }
14013
14014 return getSVESafeBitCast(VT, Narrow, DAG);
14015 }
14016
14017 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
14018 // This will be matched by custom code during ISelDAGToDAG.
14019 if (Vec0.isUndef())
14020 return Op;
14021
14022 std::optional<unsigned> PredPattern =
14023        getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
14024    auto PredTy = VT.changeVectorElementType(MVT::i1);
14025 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
14026 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
14027 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
14028 }
14029
14030 return SDValue();
14031}
14032
14033static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
14034 if (Op.getOpcode() != AArch64ISD::DUP &&
14035 Op.getOpcode() != ISD::SPLAT_VECTOR &&
14036 Op.getOpcode() != ISD::BUILD_VECTOR)
14037 return false;
14038
14039 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
14040 !isAllConstantBuildVector(Op, SplatVal))
14041 return false;
14042
14043 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
14044 !isa<ConstantSDNode>(Op->getOperand(0)))
14045 return false;
14046
14047 SplatVal = Op->getConstantOperandVal(0);
14048 if (Op.getValueType().getVectorElementType() != MVT::i64)
14049 SplatVal = (int32_t)SplatVal;
14050
14051 Negated = false;
14052 if (isPowerOf2_64(SplatVal))
14053 return true;
14054
14055 Negated = true;
14056 if (isPowerOf2_64(-SplatVal)) {
14057 SplatVal = -SplatVal;
14058 return true;
14059 }
14060
14061 return false;
14062}
14063
14064SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
14065 EVT VT = Op.getValueType();
14066 SDLoc dl(Op);
14067
14068 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
14069 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
14070
14071 assert(VT.isScalableVector() && "Expected a scalable vector.");
14072
14073 bool Signed = Op.getOpcode() == ISD::SDIV;
14074 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
14075
14076 bool Negated;
14077 uint64_t SplatVal;
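  // A divide by a power of two is lowered to a predicated rounding arithmetic
  // shift (AArch64ISD::SRAD_MERGE_OP1, the SVE ASRD instruction); e.g.
  // sdiv x, 8 becomes a shift by 3, and sdiv x, -8 additionally negates the
  // result.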
14078 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
14079 SDValue Pg = getPredicateForScalableVector(DAG, dl, VT);
14080 SDValue Res =
14081 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0),
14082 DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32));
14083 if (Negated)
14084 Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res);
14085
14086 return Res;
14087 }
14088
14089 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
14090 return LowerToPredicatedOp(Op, DAG, PredOpcode);
14091
14092 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
14093 // operations, and truncate the result.
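  // E.g. an nxv16i8 division is unpacked (SUNPKLO/SUNPKHI for sdiv,
  // UUNPKLO/UUNPKHI for udiv) into two nxv8i16 halves, the halves are divided,
  // and the results are packed back together with UZP1.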
14094 EVT WidenedVT;
14095 if (VT == MVT::nxv16i8)
14096 WidenedVT = MVT::nxv8i16;
14097 else if (VT == MVT::nxv8i16)
14098 WidenedVT = MVT::nxv4i32;
14099 else
14100 llvm_unreachable("Unexpected Custom DIV operation");
14101
14102 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
14103 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
14104 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
14105 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
14106 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
14107 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
14108 SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
14109 SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
14110 return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
14111}
14112
14113 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
14114   // Currently no fixed length shuffles that require SVE are legal.
14115 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14116 return false;
14117
14118 if (VT.getVectorNumElements() == 4 &&
14119 (VT.is128BitVector() || VT.is64BitVector())) {
14120 unsigned Cost = getPerfectShuffleCost(M);
14121 if (Cost <= 1)
14122 return true;
14123 }
14124
14125 bool DummyBool;
14126 int DummyInt;
14127 unsigned DummyUnsigned;
14128
14129 return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
14130 isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
14131 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
14132 // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
14133 isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
14134 isZIPMask(M, VT, DummyUnsigned) ||
14135 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
14136 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
14137 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
14138 isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
14139 isConcatMask(M, VT, VT.getSizeInBits() == 128));
14140}
14141
14142 bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
14143                                                    EVT VT) const {
14144 // Just delegate to the generic legality, clear masks aren't special.
14145 return isShuffleMaskLegal(M, VT);
14146}
14147
14148/// getVShiftImm - Check if this is a valid build_vector for the immediate
14149/// operand of a vector shift operation, where all the elements of the
14150/// build_vector must have the same constant integer value.
14151static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
14152 // Ignore bit_converts.
14153 while (Op.getOpcode() == ISD::BITCAST)
14154 Op = Op.getOperand(0);
14155 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
14156 APInt SplatBits, SplatUndef;
14157 unsigned SplatBitSize;
14158 bool HasAnyUndefs;
14159 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
14160 HasAnyUndefs, ElementBits) ||
14161 SplatBitSize > ElementBits)
14162 return false;
14163 Cnt = SplatBits.getSExtValue();
14164 return true;
14165}
14166
14167/// isVShiftLImm - Check if this is a valid build_vector for the immediate
14168/// operand of a vector shift left operation. That value must be in the range:
14169/// 0 <= Value < ElementBits for a left shift; or
14170/// 0 <= Value <= ElementBits for a long left shift.
14171static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
14172 assert(VT.isVector() && "vector shift count is not a vector type");
14173 int64_t ElementBits = VT.getScalarSizeInBits();
14174 if (!getVShiftImm(Op, ElementBits, Cnt))
14175 return false;
14176 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
14177}
14178
14179/// isVShiftRImm - Check if this is a valid build_vector for the immediate
14180/// operand of a vector shift right operation. The value must be in the range:
14181 ///   1 <= Value <= ElementBits for a right shift; or 1 <= Value <= ElementBits/2 for a narrow right shift.
14182static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
14183 assert(VT.isVector() && "vector shift count is not a vector type");
14184 int64_t ElementBits = VT.getScalarSizeInBits();
14185 if (!getVShiftImm(Op, ElementBits, Cnt))
14186 return false;
14187 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
14188}
14189
14190SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
14191 SelectionDAG &DAG) const {
14192 EVT VT = Op.getValueType();
14193
14194 if (VT.getScalarType() == MVT::i1) {
14195 // Lower i1 truncate to `(x & 1) != 0`.
14196 SDLoc dl(Op);
14197 EVT OpVT = Op.getOperand(0).getValueType();
14198 SDValue Zero = DAG.getConstant(0, dl, OpVT);
14199 SDValue One = DAG.getConstant(1, dl, OpVT);
14200 SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
14201 return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
14202 }
14203
14204 if (!VT.isVector() || VT.isScalableVector())
14205 return SDValue();
14206
14207 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
14208 !Subtarget->isNeonAvailable()))
14209 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
14210
14211 return SDValue();
14212}
14213
14214 // Check if we can lower this SRL to a rounding shift instruction. ResVT is
14215 // possibly a truncated type; it tells how many bits of the value are to be
14216 // used.
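// For example, with ShiftValue == 4 the pattern (srl (add X, 8), 4) implements
// a rounding right shift of X and can be selected as the predicated rounding
// shift AArch64ISD::URSHR_I_PRED, provided the add cannot overflow the bits
// that survive the truncation to ResVT.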
14217 static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT,
14218                                             SelectionDAG &DAG,
14219 unsigned &ShiftValue,
14220 SDValue &RShOperand) {
14221 if (Shift->getOpcode() != ISD::SRL)
14222 return false;
14223
14224 EVT VT = Shift.getValueType();
14225 assert(VT.isScalableVT());
14226
14227 auto ShiftOp1 =
14228 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Shift->getOperand(1)));
14229 if (!ShiftOp1)
14230 return false;
14231
14232 ShiftValue = ShiftOp1->getZExtValue();
14233 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
14234 return false;
14235
14236 SDValue Add = Shift->getOperand(0);
14237 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
14238 return false;
14239
14240   assert(ResVT.getScalarSizeInBits() <= VT.getScalarSizeInBits() &&
14241          "ResVT must be truncated or same type as the shift.");
14242 // Check if an overflow can lead to incorrect results.
14243 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
14244 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
14245 return false;
14246
14247 auto AddOp1 =
14248 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
14249 if (!AddOp1)
14250 return false;
14251 uint64_t AddValue = AddOp1->getZExtValue();
14252 if (AddValue != 1ULL << (ShiftValue - 1))
14253 return false;
14254
14255 RShOperand = Add->getOperand(0);
14256 return true;
14257}
14258
14259SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
14260 SelectionDAG &DAG) const {
14261 EVT VT = Op.getValueType();
14262 SDLoc DL(Op);
14263 int64_t Cnt;
14264
14265 if (!Op.getOperand(1).getValueType().isVector())
14266 return Op;
14267 unsigned EltSize = VT.getScalarSizeInBits();
14268
14269 switch (Op.getOpcode()) {
14270 case ISD::SHL:
14271 if (VT.isScalableVector() ||
14272        useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14273      return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
14274
14275 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
14276 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
14277 DAG.getConstant(Cnt, DL, MVT::i32));
14278 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
14279 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
14280 MVT::i32),
14281 Op.getOperand(0), Op.getOperand(1));
14282 case ISD::SRA:
14283 case ISD::SRL:
14284 if (VT.isScalableVector() && Subtarget->hasSVE2orSME()) {
14285 SDValue RShOperand;
14286 unsigned ShiftValue;
14287 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
14288 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
14289 getPredicateForVector(DAG, DL, VT), RShOperand,
14290 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
14291 }
14292
14293 if (VT.isScalableVector() ||
14294 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
14295 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
14296                                                : AArch64ISD::SRL_PRED;
14297      return LowerToPredicatedOp(Op, DAG, Opc);
14298 }
14299
14300 // Right shift immediate
14301 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
14302 unsigned Opc =
14303 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
14304 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
14305 DAG.getConstant(Cnt, DL, MVT::i32));
14306 }
14307
14308 // Right shift register. Note, there is not a shift right register
14309 // instruction, but the shift left register instruction takes a signed
14310 // value, where negative numbers specify a right shift.
14311 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
14312 : Intrinsic::aarch64_neon_ushl;
14313 // negate the shift amount
14314 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
14315 Op.getOperand(1));
14316 SDValue NegShiftLeft =
14317        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
14318                    DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
14319 NegShift);
14320 return NegShiftLeft;
14321 }
14322
14323 llvm_unreachable("unexpected shift opcode");
14324}
14325
14326 static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
14327                                     AArch64CC::CondCode CC, bool NoNans, EVT VT,
14328 const SDLoc &dl, SelectionDAG &DAG) {
14329 EVT SrcVT = LHS.getValueType();
14330 assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
14331 "function only supposed to emit natural comparisons");
14332
14333 APInt SplatValue;
14334 APInt SplatUndef;
14335 unsigned SplatBitSize = 0;
14336 bool HasAnyUndefs;
14337
14338 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
14339 bool IsCnst = BVN && BVN->isConstantSplat(SplatValue, SplatUndef,
14340 SplatBitSize, HasAnyUndefs);
14341
14342 bool IsZero = IsCnst && SplatValue == 0;
14343 bool IsOne =
14344 IsCnst && SrcVT.getScalarSizeInBits() == SplatBitSize && SplatValue == 1;
14345 bool IsMinusOne = IsCnst && SplatValue.isAllOnes();
14346
14347 if (SrcVT.getVectorElementType().isFloatingPoint()) {
14348 switch (CC) {
14349 default:
14350 return SDValue();
14351 case AArch64CC::NE: {
14352 SDValue Fcmeq;
14353 if (IsZero)
14354 Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
14355 else
14356 Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
14357 return DAG.getNOT(dl, Fcmeq, VT);
14358 }
14359 case AArch64CC::EQ:
14360 if (IsZero)
14361 return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
14362 return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
14363 case AArch64CC::GE:
14364 if (IsZero)
14365 return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
14366 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
14367 case AArch64CC::GT:
14368 if (IsZero)
14369 return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
14370 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
14371 case AArch64CC::LE:
14372 if (!NoNans)
14373 return SDValue();
14374      // If we ignore NaNs then we can use the LS implementation.
14375 [[fallthrough]];
14376 case AArch64CC::LS:
14377 if (IsZero)
14378 return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
14379 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
14380 case AArch64CC::LT:
14381 if (!NoNans)
14382 return SDValue();
14383      // If we ignore NaNs then we can use the MI implementation.
14384 [[fallthrough]];
14385 case AArch64CC::MI:
14386 if (IsZero)
14387 return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
14388 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
14389 }
14390 }
14391
14392 switch (CC) {
14393 default:
14394 return SDValue();
14395 case AArch64CC::NE: {
14396 SDValue Cmeq;
14397 if (IsZero)
14398 Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
14399 else
14400 Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
14401 return DAG.getNOT(dl, Cmeq, VT);
14402 }
14403 case AArch64CC::EQ:
14404 if (IsZero)
14405 return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
14406 return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
14407 case AArch64CC::GE:
14408 if (IsZero)
14409 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
14410 return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
14411 case AArch64CC::GT:
14412 if (IsZero)
14413 return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
14414 if (IsMinusOne)
14415 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS, RHS);
14416 return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
14417 case AArch64CC::LE:
14418 if (IsZero)
14419 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
14420 return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
14421 case AArch64CC::LS:
14422 return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
14423 case AArch64CC::LO:
14424 return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
14425 case AArch64CC::LT:
14426 if (IsZero)
14427 return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
14428 if (IsOne)
14429 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
14430 return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
14431 case AArch64CC::HI:
14432 return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
14433 case AArch64CC::HS:
14434 return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
14435 }
14436}
14437
14438SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
14439 SelectionDAG &DAG) const {
14440 if (Op.getValueType().isScalableVector())
14441 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
14442
14443 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
14444 !Subtarget->isNeonAvailable()))
14445 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
14446
14447 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
14448 SDValue LHS = Op.getOperand(0);
14449 SDValue RHS = Op.getOperand(1);
14450 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
14451 SDLoc dl(Op);
14452
14453 if (LHS.getValueType().getVectorElementType().isInteger()) {
14454 assert(LHS.getValueType() == RHS.getValueType());
14455    AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
14456    SDValue Cmp =
14457 EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
14458 return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
14459 }
14460
14461 // Lower isnan(x) | isnan(never-nan) to x != x.
14462 // Lower !isnan(x) & !isnan(never-nan) to x == x.
14463 if (CC == ISD::SETUO || CC == ISD::SETO) {
14464 bool OneNaN = false;
14465 if (LHS == RHS) {
14466 OneNaN = true;
14467 } else if (DAG.isKnownNeverNaN(RHS)) {
14468 OneNaN = true;
14469 RHS = LHS;
14470 } else if (DAG.isKnownNeverNaN(LHS)) {
14471 OneNaN = true;
14472 LHS = RHS;
14473 }
14474 if (OneNaN) {
14475      CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
14476    }
14477 }
14478
14479 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
14480
14481 // Make v4f16 (only) fcmp operations utilise vector instructions
14482  // v8f16 support will be a little more complicated
14483 if ((!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) ||
14484 LHS.getValueType().getVectorElementType() == MVT::bf16) {
14485 if (LHS.getValueType().getVectorNumElements() == 4) {
14486 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
14487 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
14488 SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
14489 DAG.ReplaceAllUsesWith(Op, NewSetcc);
14490 CmpVT = MVT::v4i32;
14491 } else
14492 return SDValue();
14493 }
14494
14495 assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
14496 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
14497 LHS.getValueType().getVectorElementType() != MVT::f128);
14498
14499 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
14500 // clean. Some of them require two branches to implement.
14501 AArch64CC::CondCode CC1, CC2;
14502 bool ShouldInvert;
14503 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
14504
14505 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
14506 SDValue Cmp =
14507 EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
14508 if (!Cmp.getNode())
14509 return SDValue();
14510
14511 if (CC2 != AArch64CC::AL) {
14512 SDValue Cmp2 =
14513 EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
14514 if (!Cmp2.getNode())
14515 return SDValue();
14516
14517 Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
14518 }
14519
14520 Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
14521
14522 if (ShouldInvert)
14523 Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
14524
14525 return Cmp;
14526}
14527
14528static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
14529 SelectionDAG &DAG) {
14530 SDValue VecOp = ScalarOp.getOperand(0);
14531 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
14532 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
14533 DAG.getConstant(0, DL, MVT::i64));
14534}
14535
14536static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
14537 SDLoc DL, SelectionDAG &DAG) {
14538 unsigned ScalarOpcode;
14539 switch (Opcode) {
14540 case ISD::VECREDUCE_AND:
14541 ScalarOpcode = ISD::AND;
14542 break;
14543 case ISD::VECREDUCE_OR:
14544 ScalarOpcode = ISD::OR;
14545 break;
14546 case ISD::VECREDUCE_XOR:
14547 ScalarOpcode = ISD::XOR;
14548 break;
14549 default:
14550 llvm_unreachable("Expected bitwise vector reduction");
14551 return SDValue();
14552 }
14553
14554 EVT VecVT = Vec.getValueType();
14555 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
14556 "Expected power-of-2 length vector");
14557
14558 EVT ElemVT = VecVT.getVectorElementType();
14559
14560 SDValue Result;
14561 unsigned NumElems = VecVT.getVectorNumElements();
14562
14563 // Special case for boolean reductions
14564 if (ElemVT == MVT::i1) {
14565 // Split large vectors into smaller ones
14566 if (NumElems > 16) {
14567 SDValue Lo, Hi;
14568 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
14569 EVT HalfVT = Lo.getValueType();
14570 SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
14571 return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
14572 }
14573
14574 // Vectors that are less than 64 bits get widened to neatly fit a 64 bit
14575 // register, so e.g. <4 x i1> gets lowered to <4 x i16>. Sign extending to
14576 // this element size leads to the best codegen, since e.g. setcc results
14577 // might need to be truncated otherwise.
14578 EVT ExtendedVT = MVT::getIntegerVT(std::max(64u / NumElems, 8u));
14579
14580 // any_ext doesn't work with umin/umax, so only use it for uadd.
14581 unsigned ExtendOp =
14582 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
14583 SDValue Extended = DAG.getNode(
14584 ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
14585 switch (ScalarOpcode) {
14586 case ISD::AND:
14587 Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
14588 break;
14589 case ISD::OR:
14590 Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
14591 break;
14592 case ISD::XOR:
14593 Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
14594 break;
14595 default:
14596 llvm_unreachable("Unexpected Opcode");
14597 }
14598
14599 Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
14600 } else {
14601 // Iteratively split the vector in half and combine using the bitwise
14602 // operation until it fits in a 64 bit register.
14603 while (VecVT.getSizeInBits() > 64) {
14604 SDValue Lo, Hi;
14605 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
14606 VecVT = Lo.getValueType();
14607 NumElems = VecVT.getVectorNumElements();
14608 Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
14609 }
14610
14611 EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
14612
14613 // Do the remaining work on a scalar since it allows the code generator to
14614 // combine the shift and bitwise operation into one instruction and since
14615 // integer instructions can have higher throughput than vector instructions.
14616 SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
14617
14618 // Iteratively combine the lower and upper halves of the scalar using the
14619 // bitwise operation, halving the relevant region of the scalar in each
14620 // iteration, until the relevant region is just one element of the original
14621 // vector.
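    // E.g. a v8i8 reduction bitcast to i64 is combined with copies of itself
    // shifted right by 32, 16 and then 8 bits, leaving the result in the low
    // byte.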
14622 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
14623 SDValue ShiftAmount =
14624 DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
14625 SDValue Shifted =
14626 DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
14627 Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
14628 }
14629
14630 Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
14631 }
14632
14633 return DAG.getAnyExtOrTrunc(Result, DL, VT);
14634}
14635
14636SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
14637 SelectionDAG &DAG) const {
14638 SDValue Src = Op.getOperand(0);
14639
14640 // Try to lower fixed length reductions to SVE.
14641 EVT SrcVT = Src.getValueType();
14642 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
14643 Op.getOpcode() == ISD::VECREDUCE_AND ||
14644 Op.getOpcode() == ISD::VECREDUCE_OR ||
14645 Op.getOpcode() == ISD::VECREDUCE_XOR ||
14646 Op.getOpcode() == ISD::VECREDUCE_FADD ||
14647 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
14648 SrcVT.getVectorElementType() == MVT::i64);
14649 if (SrcVT.isScalableVector() ||
14651      useSVEForFixedLengthVectorVT(SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
14652
14653 if (SrcVT.getVectorElementType() == MVT::i1)
14654 return LowerPredReductionToSVE(Op, DAG);
14655
14656 switch (Op.getOpcode()) {
14657 case ISD::VECREDUCE_ADD:
14658 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
14659 case ISD::VECREDUCE_AND:
14660 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
14661 case ISD::VECREDUCE_OR:
14662 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
14663    case ISD::VECREDUCE_SMAX:
14664      return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
14665    case ISD::VECREDUCE_SMIN:
14666      return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
14667    case ISD::VECREDUCE_UMAX:
14668      return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
14669    case ISD::VECREDUCE_UMIN:
14670      return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
14671    case ISD::VECREDUCE_XOR:
14672      return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
14673    case ISD::VECREDUCE_FADD:
14674      return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
14675    case ISD::VECREDUCE_FMAX:
14676      return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
14677    case ISD::VECREDUCE_FMIN:
14678      return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
14679    case ISD::VECREDUCE_FMAXIMUM:
14680      return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
14681    case ISD::VECREDUCE_FMINIMUM:
14682      return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
14683 default:
14684 llvm_unreachable("Unhandled fixed length reduction");
14685 }
14686 }
14687
14688 // Lower NEON reductions.
14689 SDLoc dl(Op);
14690 switch (Op.getOpcode()) {
14691 case ISD::VECREDUCE_AND:
14692 case ISD::VECREDUCE_OR:
14693 case ISD::VECREDUCE_XOR:
14694 return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
14695 Op.getValueType(), dl, DAG);
14696 case ISD::VECREDUCE_ADD:
14697 return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
14698  case ISD::VECREDUCE_SMAX:
14699    return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
14700  case ISD::VECREDUCE_SMIN:
14701    return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
14702  case ISD::VECREDUCE_UMAX:
14703    return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
14704  case ISD::VECREDUCE_UMIN:
14705    return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
14706 default:
14707 llvm_unreachable("Unhandled reduction");
14708 }
14709}
14710
14711SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
14712 SelectionDAG &DAG) const {
14713 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
14714 // No point replacing if we don't have the relevant instruction/libcall anyway
14715 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
14716 return SDValue();
14717
14718 // LSE has an atomic load-clear instruction, but not a load-and.
14719 SDLoc dl(Op);
14720 MVT VT = Op.getSimpleValueType();
14721 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
14722 SDValue RHS = Op.getOperand(2);
14723 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
14724 RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
14725 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
14726 Op.getOperand(0), Op.getOperand(1), RHS,
14727 AN->getMemOperand());
14728}
14729
14730SDValue
14731AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
14732 SelectionDAG &DAG) const {
14733
14734 SDLoc dl(Op);
14735 // Get the inputs.
14736 SDNode *Node = Op.getNode();
14737 SDValue Chain = Op.getOperand(0);
14738 SDValue Size = Op.getOperand(1);
14739  MaybeAlign Align =
14740      cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
14741 EVT VT = Node->getValueType(0);
14742
14743  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
14744          "no-stack-arg-probe")) {
14745 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14746 Chain = SP.getValue(1);
14747 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14748 if (Align)
14749 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14750 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
14751 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
14752 SDValue Ops[2] = {SP, Chain};
14753 return DAG.getMergeValues(Ops, dl);
14754 }
14755
14756 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
14757
14758 EVT PtrVT = getPointerTy(DAG.getDataLayout());
14759  SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
14760                                               PtrVT, 0);
14761
14762 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
14763 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
14764 if (Subtarget->hasCustomCallingConv())
14765 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
14766
14767 Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
14768 DAG.getConstant(4, dl, MVT::i64));
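  // Note: on Windows the __chkstk helper expects the allocation size in X15 in
  // units of 16 bytes, hence the shift right by 4 here and the shift back left
  // after the call.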
14769 Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
14770 Chain =
14771 DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
14772 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
14773 DAG.getRegisterMask(Mask), Chain.getValue(1));
14774 // To match the actual intent better, we should read the output from X15 here
14775 // again (instead of potentially spilling it to the stack), but rereading Size
14776 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
14777 // here.
14778
14779 Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
14780 DAG.getConstant(4, dl, MVT::i64));
14781
14782 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14783 Chain = SP.getValue(1);
14784 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14785 if (Align)
14786 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14787 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
14788 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
14789
14790 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
14791
14792 SDValue Ops[2] = {SP, Chain};
14793 return DAG.getMergeValues(Ops, dl);
14794}
14795
14796SDValue
14797AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
14798 SelectionDAG &DAG) const {
14799 // Get the inputs.
14800 SDNode *Node = Op.getNode();
14801 SDValue Chain = Op.getOperand(0);
14802 SDValue Size = Op.getOperand(1);
14803
14804  MaybeAlign Align =
14805      cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
14806 SDLoc dl(Op);
14807 EVT VT = Node->getValueType(0);
14808
14809 // Construct the new SP value in a GPR.
14810 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14811 Chain = SP.getValue(1);
14812 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14813 if (Align)
14814 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14815 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
14816
14817 // Set the real SP to the new value with a probing loop.
14818 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
14819 SDValue Ops[2] = {SP, Chain};
14820 return DAG.getMergeValues(Ops, dl);
14821}
14822
14823SDValue
14824AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
14825 SelectionDAG &DAG) const {
14826  MachineFunction &MF = DAG.getMachineFunction();
14827
14828 if (Subtarget->isTargetWindows())
14829 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
14830 else if (hasInlineStackProbe(MF))
14831 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
14832 else
14833 return SDValue();
14834}
14835
14836// When x and y are extended, lower:
14837// avgfloor(x, y) -> (x + y) >> 1
14838// avgceil(x, y) -> (x + y + 1) >> 1
14839
14840// Otherwise, lower to:
14841// avgfloor(x, y) -> (x >> 1) + (y >> 1) + (x & y & 1)
14842 //   avgceil(x, y)  -> (x >> 1) + (y >> 1) + ((x | y) & 1)
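// The second form avoids the intermediate x + y overflowing the element type;
// e.g. for unsigned i8 inputs 200 and 100 the sum 300 does not fit in 8 bits,
// while (200 >> 1) + (100 >> 1) + ((200 & 100) & 1) = 100 + 50 + 0 = 150 is
// exact.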
14843SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
14844 unsigned NewOp) const {
14845 if (Subtarget->hasSVE2())
14846 return LowerToPredicatedOp(Op, DAG, NewOp);
14847
14848 SDLoc dl(Op);
14849 SDValue OpA = Op->getOperand(0);
14850 SDValue OpB = Op->getOperand(1);
14851 EVT VT = Op.getValueType();
14852 bool IsCeil =
14853 (Op->getOpcode() == ISD::AVGCEILS || Op->getOpcode() == ISD::AVGCEILU);
14854 bool IsSigned =
14855 (Op->getOpcode() == ISD::AVGFLOORS || Op->getOpcode() == ISD::AVGCEILS);
14856 unsigned ShiftOpc = IsSigned ? ISD::SRA : ISD::SRL;
14857
14858 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
14859
14860 auto IsZeroExtended = [&DAG](SDValue &Node) {
14861 KnownBits Known = DAG.computeKnownBits(Node, 0);
14862 return Known.Zero.isSignBitSet();
14863 };
14864
14865 auto IsSignExtended = [&DAG](SDValue &Node) {
14866 return (DAG.ComputeNumSignBits(Node, 0) > 1);
14867 };
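  // Both lambdas check for a spare top bit: if the operands were zero- or
  // sign-extended, x + y cannot wrap, so the simple add-then-shift form is
  // safe.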
14868
14869 SDValue ConstantOne = DAG.getConstant(1, dl, VT);
14870 if ((!IsSigned && IsZeroExtended(OpA) && IsZeroExtended(OpB)) ||
14871 (IsSigned && IsSignExtended(OpA) && IsSignExtended(OpB))) {
14872 SDValue Add = DAG.getNode(ISD::ADD, dl, VT, OpA, OpB);
14873 if (IsCeil)
14874 Add = DAG.getNode(ISD::ADD, dl, VT, Add, ConstantOne);
14875 return DAG.getNode(ShiftOpc, dl, VT, Add, ConstantOne);
14876 }
14877
14878 SDValue ShiftOpA = DAG.getNode(ShiftOpc, dl, VT, OpA, ConstantOne);
14879 SDValue ShiftOpB = DAG.getNode(ShiftOpc, dl, VT, OpB, ConstantOne);
14880
14881 SDValue tmp = DAG.getNode(IsCeil ? ISD::OR : ISD::AND, dl, VT, OpA, OpB);
14882 tmp = DAG.getNode(ISD::AND, dl, VT, tmp, ConstantOne);
14883 SDValue Add = DAG.getNode(ISD::ADD, dl, VT, ShiftOpA, ShiftOpB);
14884 return DAG.getNode(ISD::ADD, dl, VT, Add, tmp);
14885}
14886
14887SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
14888 SelectionDAG &DAG) const {
14889 EVT VT = Op.getValueType();
14890 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
14891
14892 SDLoc DL(Op);
14893 APInt MulImm = Op.getConstantOperandAPInt(0);
14894 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
14895 VT);
14896}
14897
14898/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
14899template <unsigned NumVecs>
14900static bool
14901 setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
14902               AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
14903   Info.opc = ISD::INTRINSIC_VOID;
14904   // Retrieve EC from first vector argument.
14905 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
14906   ElementCount EC = VT.getVectorElementCount();
14907 #ifndef NDEBUG
14908 // Check the assumption that all input vectors are the same type.
14909 for (unsigned I = 0; I < NumVecs; ++I)
14910 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
14911 "Invalid type.");
14912#endif
14913 // memVT is `NumVecs * VT`.
14914   Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
14915                                 EC * NumVecs);
14916 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
14917 Info.offset = 0;
14918 Info.align.reset();
14919   Info.flags = MachineMemOperand::MOStore;
14920   return true;
14921}
14922
14923/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
14924/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
14925/// specified in the intrinsic calls.
14927 const CallInst &I,
14928 MachineFunction &MF,
14929 unsigned Intrinsic) const {
14930 auto &DL = I.getModule()->getDataLayout();
14931 switch (Intrinsic) {
14932 case Intrinsic::aarch64_sve_st2:
14933 return setInfoSVEStN<2>(*this, DL, Info, I);
14934 case Intrinsic::aarch64_sve_st3:
14935 return setInfoSVEStN<3>(*this, DL, Info, I);
14936 case Intrinsic::aarch64_sve_st4:
14937 return setInfoSVEStN<4>(*this, DL, Info, I);
14938 case Intrinsic::aarch64_neon_ld2:
14939 case Intrinsic::aarch64_neon_ld3:
14940 case Intrinsic::aarch64_neon_ld4:
14941 case Intrinsic::aarch64_neon_ld1x2:
14942 case Intrinsic::aarch64_neon_ld1x3:
14943 case Intrinsic::aarch64_neon_ld1x4: {
14945 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
14946 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
14947 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14948 Info.offset = 0;
14949 Info.align.reset();
14950 // volatile loads with NEON intrinsics not supported
14952 return true;
14953 }
14954 case Intrinsic::aarch64_neon_ld2lane:
14955 case Intrinsic::aarch64_neon_ld3lane:
14956 case Intrinsic::aarch64_neon_ld4lane:
14957 case Intrinsic::aarch64_neon_ld2r:
14958 case Intrinsic::aarch64_neon_ld3r:
14959 case Intrinsic::aarch64_neon_ld4r: {
14961 // ldx return struct with the same vec type
14962 Type *RetTy = I.getType();
14963 auto *StructTy = cast<StructType>(RetTy);
14964 unsigned NumElts = StructTy->getNumElements();
14965 Type *VecTy = StructTy->getElementType(0);
14966 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
14967 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
14968 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14969 Info.offset = 0;
14970 Info.align.reset();
14971 // volatile loads with NEON intrinsics not supported
14973 return true;
14974 }
14975 case Intrinsic::aarch64_neon_st2:
14976 case Intrinsic::aarch64_neon_st3:
14977 case Intrinsic::aarch64_neon_st4:
14978 case Intrinsic::aarch64_neon_st1x2:
14979 case Intrinsic::aarch64_neon_st1x3:
14980 case Intrinsic::aarch64_neon_st1x4: {
14982 unsigned NumElts = 0;
14983 for (const Value *Arg : I.args()) {
14984 Type *ArgTy = Arg->getType();
14985 if (!ArgTy->isVectorTy())
14986 break;
14987 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
14988 }
14989 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
14990 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14991 Info.offset = 0;
14992 Info.align.reset();
14993 // volatile stores with NEON intrinsics not supported
14995 return true;
14996 }
14997 case Intrinsic::aarch64_neon_st2lane:
14998 case Intrinsic::aarch64_neon_st3lane:
14999 case Intrinsic::aarch64_neon_st4lane: {
15001 unsigned NumElts = 0;
15002    // All of the vector arguments have the same type.
15003 Type *VecTy = I.getArgOperand(0)->getType();
15004 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
15005
15006 for (const Value *Arg : I.args()) {
15007 Type *ArgTy = Arg->getType();
15008 if (!ArgTy->isVectorTy())
15009 break;
15010 NumElts += 1;
15011 }
15012
15013 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
15014 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
15015 Info.offset = 0;
15016 Info.align.reset();
15017 // volatile stores with NEON intrinsics not supported
15019 return true;
15020 }
15021 case Intrinsic::aarch64_ldaxr:
15022 case Intrinsic::aarch64_ldxr: {
15023 Type *ValTy = I.getParamElementType(0);
15025 Info.memVT = MVT::getVT(ValTy);
15026 Info.ptrVal = I.getArgOperand(0);
15027 Info.offset = 0;
15028 Info.align = DL.getABITypeAlign(ValTy);
15030 return true;
15031 }
15032 case Intrinsic::aarch64_stlxr:
15033 case Intrinsic::aarch64_stxr: {
15034 Type *ValTy = I.getParamElementType(1);
15036 Info.memVT = MVT::getVT(ValTy);
15037 Info.ptrVal = I.getArgOperand(1);
15038 Info.offset = 0;
15039 Info.align = DL.getABITypeAlign(ValTy);
15041 return true;
15042 }
15043 case Intrinsic::aarch64_ldaxp:
15044 case Intrinsic::aarch64_ldxp:
15046 Info.memVT = MVT::i128;
15047 Info.ptrVal = I.getArgOperand(0);
15048 Info.offset = 0;
15049 Info.align = Align(16);
15051 return true;
15052 case Intrinsic::aarch64_stlxp:
15053 case Intrinsic::aarch64_stxp:
15055 Info.memVT = MVT::i128;
15056 Info.ptrVal = I.getArgOperand(2);
15057 Info.offset = 0;
15058 Info.align = Align(16);
15060 return true;
15061 case Intrinsic::aarch64_sve_ldnt1: {
15062 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
15064 Info.memVT = MVT::getVT(I.getType());
15065 Info.ptrVal = I.getArgOperand(1);
15066 Info.offset = 0;
15067 Info.align = DL.getABITypeAlign(ElTy);
15069 return true;
15070 }
15071 case Intrinsic::aarch64_sve_stnt1: {
15072 Type *ElTy =
15073 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
15075 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
15076 Info.ptrVal = I.getArgOperand(2);
15077 Info.offset = 0;
15078 Info.align = DL.getABITypeAlign(ElTy);
15080 return true;
15081 }
15082 case Intrinsic::aarch64_mops_memset_tag: {
15083 Value *Dst = I.getArgOperand(0);
15084 Value *Val = I.getArgOperand(1);
15086 Info.memVT = MVT::getVT(Val->getType());
15087 Info.ptrVal = Dst;
15088 Info.offset = 0;
15089 Info.align = I.getParamAlign(0).valueOrOne();
15091 // The size of the memory being operated on is unknown at this point
15093 return true;
15094 }
15095 default:
15096 break;
15097 }
15098
15099 return false;
15100}
15101
15102 bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
15103                                                   ISD::LoadExtType ExtTy,
15104 EVT NewVT) const {
15105 // TODO: This may be worth removing. Check regression tests for diffs.
15106 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
15107 return false;
15108
15109 // If we're reducing the load width in order to avoid having to use an extra
15110 // instruction to do extension then it's probably a good idea.
15111 if (ExtTy != ISD::NON_EXTLOAD)
15112 return true;
15113 // Don't reduce load width if it would prevent us from combining a shift into
15114 // the offset.
15115 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
15116 assert(Mem);
15117 const SDValue &Base = Mem->getBasePtr();
15118 if (Base.getOpcode() == ISD::ADD &&
15119 Base.getOperand(1).getOpcode() == ISD::SHL &&
15120 Base.getOperand(1).hasOneUse() &&
15121 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
15122 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
15123 if (Mem->getMemoryVT().isScalableVector())
15124 return false;
15125 // The shift can be combined if it matches the size of the value being
15126 // loaded (and so reducing the width would make it not match).
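    // E.g. a 64-bit load whose address is (add base, (shl idx, 3)) can be
    // selected as ldr x0, [x1, x2, lsl #3]; narrowing the load would leave the
    // shift unfolded.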
15127 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
15128 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
15129 if (ShiftAmount == Log2_32(LoadBytes))
15130 return false;
15131 }
15132 // We have no reason to disallow reducing the load width, so allow it.
15133 return true;
15134}
15135
15136// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
15137 bool AArch64TargetLowering::shouldRemoveRedundantExtend(SDValue Extend) const {
15138   EVT VT = Extend.getValueType();
15139 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
15140 SDValue Extract = Extend.getOperand(0);
15141 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
15142 Extract = Extract.getOperand(0);
15143 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
15144 EVT VecVT = Extract.getOperand(0).getValueType();
15145 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
15146 return false;
15147 }
15148 }
15149 return true;
15150}
15151
15152// Truncations from 64-bit GPR to 32-bit GPR are free.
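// (For example, the i32 result of truncating an i64 value is simply the W
// view of the same X register, so no instruction is needed.)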
15154 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
15155 return false;
15156 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
15157 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
15158 return NumBits1 > NumBits2;
15159}
15161 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
15162 return false;
15163 uint64_t NumBits1 = VT1.getFixedSizeInBits();
15164 uint64_t NumBits2 = VT2.getFixedSizeInBits();
15165 return NumBits1 > NumBits2;
15166}
15167
15168/// Check if it is profitable to hoist an instruction in then/else into the if
15169/// block. It is not profitable if I and its user can form an FMA instruction,
15170/// because we prefer FMSUB/FMADD.
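/// For example, keeping "%t = fmul %a, %b; %r = fadd %t, %c" together in one
/// block lets instruction selection emit a single fmadd rather than a
/// separate fmul and fadd.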
15172 if (I->getOpcode() != Instruction::FMul)
15173 return true;
15174
15175 if (!I->hasOneUse())
15176 return true;
15177
15178 Instruction *User = I->user_back();
15179
15180 if (!(User->getOpcode() == Instruction::FSub ||
15181 User->getOpcode() == Instruction::FAdd))
15182 return true;
15183
15185 const Function *F = I->getFunction();
15186 const DataLayout &DL = F->getParent()->getDataLayout();
15187 Type *Ty = User->getOperand(0)->getType();
15188
15189 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
15191 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
15192 Options.UnsafeFPMath));
15193}
15194
15195// All 32-bit GPR operations implicitly zero the high-half of the corresponding
15196// 64-bit GPR.
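// (For example, "add w0, w1, w2" already clears bits [63:32] of x0, so a
// following zext from i32 to i64 needs no extra instruction.)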
15198 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
15199 return false;
15200 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
15201 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
15202 return NumBits1 == 32 && NumBits2 == 64;
15203}
15205 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
15206 return false;
15207 unsigned NumBits1 = VT1.getSizeInBits();
15208 unsigned NumBits2 = VT2.getSizeInBits();
15209 return NumBits1 == 32 && NumBits2 == 64;
15210}
15211
15213 EVT VT1 = Val.getValueType();
15214 if (isZExtFree(VT1, VT2)) {
15215 return true;
15216 }
15217
15218 if (Val.getOpcode() != ISD::LOAD)
15219 return false;
15220
15221 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
15222 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
15223 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
15224 VT1.getSizeInBits() <= 32);
15225}
15226
15227bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
15228 if (isa<FPExtInst>(Ext))
15229 return false;
15230
15231 // Vector types are not free.
15232 if (Ext->getType()->isVectorTy())
15233 return false;
15234
15235 for (const Use &U : Ext->uses()) {
15236 // The extension is free if we can fold it with a left shift in an
15237 // addressing mode or an arithmetic operation: add, sub, and cmp.
15238
15239 // Is there a shift?
15240 const Instruction *Instr = cast<Instruction>(U.getUser());
15241
15242 // Is this a constant shift?
15243 switch (Instr->getOpcode()) {
15244 case Instruction::Shl:
15245 if (!isa<ConstantInt>(Instr->getOperand(1)))
15246 return false;
15247 break;
15248 case Instruction::GetElementPtr: {
15249 gep_type_iterator GTI = gep_type_begin(Instr);
15250 auto &DL = Ext->getModule()->getDataLayout();
15251 std::advance(GTI, U.getOperandNo()-1);
15252 Type *IdxTy = GTI.getIndexedType();
15253 // This extension will end up with a shift because of the scaling factor.
15254 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
15255 // Get the shift amount based on the scaling factor:
15256 // log2(sizeof(IdxTy)) - log2(8).
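      // For example, indexing over i32 elements gives
      // ShiftAmt = countr_zero(32) - 3 = 2, which can be folded into an
      // addressing mode such as "ldr w0, [x0, w1, sxtw #2]".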
15257 if (IdxTy->isScalableTy())
15258 return false;
15259 uint64_t ShiftAmt =
15260 llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
15261 3;
15262 // Is the constant foldable in the shift of the addressing mode?
15263 // I.e., shift amount is between 1 and 4 inclusive.
15264 if (ShiftAmt == 0 || ShiftAmt > 4)
15265 return false;
15266 break;
15267 }
15268 case Instruction::Trunc:
15269 // Check if this is a noop.
15270 // trunc(sext ty1 to ty2) to ty1.
15271 if (Instr->getType() == Ext->getOperand(0)->getType())
15272 continue;
15273 [[fallthrough]];
15274 default:
15275 return false;
15276 }
15277
15278 // At this point we can use the bfm family, so this extension is free
15279 // for that use.
15280 }
15281 return true;
15282}
15283
15284static bool isSplatShuffle(Value *V) {
15285 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
15286 return all_equal(Shuf->getShuffleMask());
15287 return false;
15288}
15289
15290/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
15291/// or upper half of the vector elements.
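/// For example, two shufflevectors of an <8 x i16> source that both use the
/// mask <4, 5, 6, 7> (the upper half) qualify; unless one operand is a splat,
/// both must select the same half.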
15292static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
15293 bool AllowSplat = false) {
15294 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
15295 auto *FullTy = FullV->getType();
15296 auto *HalfTy = HalfV->getType();
15297 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
15298 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
15299 };
15300
15301 auto extractHalf = [](Value *FullV, Value *HalfV) {
15302 auto *FullVT = cast<FixedVectorType>(FullV->getType());
15303 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
15304 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
15305 };
15306
15307 ArrayRef<int> M1, M2;
15308 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
15309 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
15310 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
15311 return false;
15312
15313 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
15314 // it is not checked as an extract below.
15315 if (AllowSplat && isSplatShuffle(Op1))
15316 S1Op1 = nullptr;
15317 if (AllowSplat && isSplatShuffle(Op2))
15318 S2Op1 = nullptr;
15319
15320 // Check that the operands are half as wide as the result and we extract
15321 // half of the elements of the input vectors.
15322 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
15323 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
15324 return false;
15325
15326 // Check the mask extracts either the lower or upper half of vector
15327 // elements.
15328 int M1Start = 0;
15329 int M2Start = 0;
15330 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
15331 if ((S1Op1 &&
15332 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
15333 (S2Op1 &&
15334 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
15335 return false;
15336
15337 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
15338 (M2Start != 0 && M2Start != (NumElements / 2)))
15339 return false;
15340 if (S1Op1 && S2Op1 && M1Start != M2Start)
15341 return false;
15342
15343 return true;
15344}
15345
15346/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
15347/// of the vector elements.
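/// For example, a pair of operands such as "sext <4 x i16> %a to <4 x i32>"
/// qualifies, since each result element is twice as wide as its source
/// element.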
15348static bool areExtractExts(Value *Ext1, Value *Ext2) {
15349 auto areExtDoubled = [](Instruction *Ext) {
15350 return Ext->getType()->getScalarSizeInBits() ==
15351 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
15352 };
15353
15354 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
15355 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
15356 !areExtDoubled(cast<Instruction>(Ext1)) ||
15357 !areExtDoubled(cast<Instruction>(Ext2)))
15358 return false;
15359
15360 return true;
15361}
15362
15363/// Check if Op could be used with vmull_high_p64 intrinsic.
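/// That is, an extractelement of lane 1 from a <2 x i64> vector, which the
/// pmull2 instruction can read directly from the top half of its source
/// register.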
15365 Value *VectorOperand = nullptr;
15366 ConstantInt *ElementIndex = nullptr;
15367 return match(Op, m_ExtractElt(m_Value(VectorOperand),
15368 m_ConstantInt(ElementIndex))) &&
15369 ElementIndex->getValue() == 1 &&
15370 isa<FixedVectorType>(VectorOperand->getType()) &&
15371 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
15372}
15373
15374/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
15375static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
15377}
15378
15380 // Restrict ourselves to the form CodeGenPrepare typically constructs.
15381 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
15382 if (!GEP || GEP->getNumOperands() != 2)
15383 return false;
15384
15385 Value *Base = GEP->getOperand(0);
15386 Value *Offsets = GEP->getOperand(1);
15387
15388 // We only care about scalar_base+vector_offsets.
15389 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
15390 return false;
15391
15392 // Sink extends that would allow us to use 32-bit offset vectors.
15393 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
15394 auto *OffsetsInst = cast<Instruction>(Offsets);
15395 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
15396 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
15397 Ops.push_back(&GEP->getOperandUse(1));
15398 }
15399
15400 // Sink the GEP.
15401 return true;
15402}
15403
15404/// We want to sink the following cases:
15405/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale
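/// For example, in "%o = shl i64 %vscale, 4; %p = getelementptr i8, ptr %A,
/// i64 %o", sinking the shl next to the gep allows the address to be selected
/// as a single vscale-scaled (addvl-style) computation.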
15407 if (match(Op, m_VScale()))
15408 return true;
15409 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
15411 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
15412 return true;
15413 }
15414 return false;
15415}
15416
15417/// Check if sinking \p I's operands to I's basic block is profitable, because
15418/// the operands can be folded into a target instruction, e.g.
15419/// shufflevector extracts and/or sext/zext can be folded into (u,s)subl(2).
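/// For example, sinking the two sext operands of a vector mul next to the mul
/// allows instruction selection to form a single smull/smull2.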
15421 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
15422 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
15423 switch (II->getIntrinsicID()) {
15424 case Intrinsic::aarch64_neon_smull:
15425 case Intrinsic::aarch64_neon_umull:
15426 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
15427 /*AllowSplat=*/true)) {
15428 Ops.push_back(&II->getOperandUse(0));
15429 Ops.push_back(&II->getOperandUse(1));
15430 return true;
15431 }
15432 [[fallthrough]];
15433
15434 case Intrinsic::fma:
15435 if (isa<VectorType>(I->getType()) &&
15436 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
15437 !Subtarget->hasFullFP16())
15438 return false;
15439 [[fallthrough]];
15440 case Intrinsic::aarch64_neon_sqdmull:
15441 case Intrinsic::aarch64_neon_sqdmulh:
15442 case Intrinsic::aarch64_neon_sqrdmulh:
15443 // Sink splats for index lane variants
15444 if (isSplatShuffle(II->getOperand(0)))
15445 Ops.push_back(&II->getOperandUse(0));
15446 if (isSplatShuffle(II->getOperand(1)))
15447 Ops.push_back(&II->getOperandUse(1));
15448 return !Ops.empty();
15449 case Intrinsic::aarch64_neon_fmlal:
15450 case Intrinsic::aarch64_neon_fmlal2:
15451 case Intrinsic::aarch64_neon_fmlsl:
15452 case Intrinsic::aarch64_neon_fmlsl2:
15453 // Sink splats for index lane variants
15454 if (isSplatShuffle(II->getOperand(1)))
15455 Ops.push_back(&II->getOperandUse(1));
15456 if (isSplatShuffle(II->getOperand(2)))
15457 Ops.push_back(&II->getOperandUse(2));
15458 return !Ops.empty();
15459 case Intrinsic::aarch64_sve_ptest_first:
15460 case Intrinsic::aarch64_sve_ptest_last:
15461 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
15462 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
15463 Ops.push_back(&II->getOperandUse(0));
15464 return !Ops.empty();
15465 case Intrinsic::aarch64_sme_write_horiz:
15466 case Intrinsic::aarch64_sme_write_vert:
15467 case Intrinsic::aarch64_sme_writeq_horiz:
15468 case Intrinsic::aarch64_sme_writeq_vert: {
15469 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
15470 if (!Idx || Idx->getOpcode() != Instruction::Add)
15471 return false;
15472 Ops.push_back(&II->getOperandUse(1));
15473 return true;
15474 }
15475 case Intrinsic::aarch64_sme_read_horiz:
15476 case Intrinsic::aarch64_sme_read_vert:
15477 case Intrinsic::aarch64_sme_readq_horiz:
15478 case Intrinsic::aarch64_sme_readq_vert:
15479 case Intrinsic::aarch64_sme_ld1b_vert:
15480 case Intrinsic::aarch64_sme_ld1h_vert:
15481 case Intrinsic::aarch64_sme_ld1w_vert:
15482 case Intrinsic::aarch64_sme_ld1d_vert:
15483 case Intrinsic::aarch64_sme_ld1q_vert:
15484 case Intrinsic::aarch64_sme_st1b_vert:
15485 case Intrinsic::aarch64_sme_st1h_vert:
15486 case Intrinsic::aarch64_sme_st1w_vert:
15487 case Intrinsic::aarch64_sme_st1d_vert:
15488 case Intrinsic::aarch64_sme_st1q_vert:
15489 case Intrinsic::aarch64_sme_ld1b_horiz:
15490 case Intrinsic::aarch64_sme_ld1h_horiz:
15491 case Intrinsic::aarch64_sme_ld1w_horiz:
15492 case Intrinsic::aarch64_sme_ld1d_horiz:
15493 case Intrinsic::aarch64_sme_ld1q_horiz:
15494 case Intrinsic::aarch64_sme_st1b_horiz:
15495 case Intrinsic::aarch64_sme_st1h_horiz:
15496 case Intrinsic::aarch64_sme_st1w_horiz:
15497 case Intrinsic::aarch64_sme_st1d_horiz:
15498 case Intrinsic::aarch64_sme_st1q_horiz: {
15499 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
15500 if (!Idx || Idx->getOpcode() != Instruction::Add)
15501 return false;
15502 Ops.push_back(&II->getOperandUse(3));
15503 return true;
15504 }
15505 case Intrinsic::aarch64_neon_pmull:
15506 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
15507 return false;
15508 Ops.push_back(&II->getOperandUse(0));
15509 Ops.push_back(&II->getOperandUse(1));
15510 return true;
15511 case Intrinsic::aarch64_neon_pmull64:
15512 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
15513 II->getArgOperand(1)))
15514 return false;
15515 Ops.push_back(&II->getArgOperandUse(0));
15516 Ops.push_back(&II->getArgOperandUse(1));
15517 return true;
15518 case Intrinsic::masked_gather:
15519 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
15520 return false;
15521 Ops.push_back(&II->getArgOperandUse(0));
15522 return true;
15523 case Intrinsic::masked_scatter:
15524 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
15525 return false;
15526 Ops.push_back(&II->getArgOperandUse(1));
15527 return true;
15528 default:
15529 return false;
15530 }
15531 }
15532
15533 // Sink vscales closer to uses for better isel
15534 switch (I->getOpcode()) {
15535 case Instruction::GetElementPtr:
15536 case Instruction::Add:
15537 case Instruction::Sub:
15538 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
15539 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
15540 Ops.push_back(&I->getOperandUse(Op));
15541 return true;
15542 }
15543 }
15544 break;
15545 default:
15546 break;
15547 }
15548
15549 if (!I->getType()->isVectorTy())
15550 return false;
15551
15552 switch (I->getOpcode()) {
15553 case Instruction::Sub:
15554 case Instruction::Add: {
15555 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
15556 return false;
15557
15558 // If the exts' operands extract either the lower or upper elements, we
15559 // can sink them too.
15560 auto Ext1 = cast<Instruction>(I->getOperand(0));
15561 auto Ext2 = cast<Instruction>(I->getOperand(1));
15562 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
15563 Ops.push_back(&Ext1->getOperandUse(0));
15564 Ops.push_back(&Ext2->getOperandUse(0));
15565 }
15566
15567 Ops.push_back(&I->getOperandUse(0));
15568 Ops.push_back(&I->getOperandUse(1));
15569
15570 return true;
15571 }
15572 case Instruction::Or: {
15573 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
15574 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
15575 if (Subtarget->hasNEON()) {
15576 Instruction *OtherAnd, *IA, *IB;
15577 Value *MaskValue;
15578 // MainAnd refers to the And instruction that has 'Not' as one of its operands.
15579 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
15580 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
15581 m_Instruction(IA)))))) {
15582 if (match(OtherAnd,
15583 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
15584 Instruction *MainAnd = I->getOperand(0) == OtherAnd
15585 ? cast<Instruction>(I->getOperand(1))
15586 : cast<Instruction>(I->getOperand(0));
15587
15588 // Both Ands should be in the same basic block as the Or.
15589 if (I->getParent() != MainAnd->getParent() ||
15590 I->getParent() != OtherAnd->getParent())
15591 return false;
15592
15593 // Non-mask operands of both Ands should also be in the same basic block.
15594 if (I->getParent() != IA->getParent() ||
15595 I->getParent() != IB->getParent())
15596 return false;
15597
15598 Ops.push_back(&MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
15599 Ops.push_back(&I->getOperandUse(0));
15600 Ops.push_back(&I->getOperandUse(1));
15601
15602 return true;
15603 }
15604 }
15605 }
15606
15607 return false;
15608 }
15609 case Instruction::Mul: {
15610 int NumZExts = 0, NumSExts = 0;
15611 for (auto &Op : I->operands()) {
15612 // Make sure we are not already sinking this operand
15613 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
15614 continue;
15615
15616 if (match(&Op, m_SExt(m_Value()))) {
15617 NumSExts++;
15618 continue;
15619 } else if (match(&Op, m_ZExt(m_Value()))) {
15620 NumZExts++;
15621 continue;
15622 }
15623
15624 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
15625
15626 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
15627 // operand and the s/zext can help create indexed s/umull. This is
15628 // especially useful to prevent i64 mul being scalarized.
15629 if (Shuffle && isSplatShuffle(Shuffle) &&
15630 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
15631 Ops.push_back(&Shuffle->getOperandUse(0));
15632 Ops.push_back(&Op);
15633 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
15634 NumSExts++;
15635 else
15636 NumZExts++;
15637 continue;
15638 }
15639
15640 if (!Shuffle)
15641 continue;
15642
15643 Value *ShuffleOperand = Shuffle->getOperand(0);
15644 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
15645 if (!Insert)
15646 continue;
15647
15648 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
15649 if (!OperandInstr)
15650 continue;
15651
15652 ConstantInt *ElementConstant =
15653 dyn_cast<ConstantInt>(Insert->getOperand(2));
15654 // Check that the insertelement is inserting into element 0
15655 if (!ElementConstant || !ElementConstant->isZero())
15656 continue;
15657
15658 unsigned Opcode = OperandInstr->getOpcode();
15659 if (Opcode == Instruction::SExt)
15660 NumSExts++;
15661 else if (Opcode == Instruction::ZExt)
15662 NumZExts++;
15663 else {
15664 // If we find that the top bits are known 0, then we can sink and allow
15665 // the backend to generate a umull.
15666 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
15667 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
15668 const DataLayout &DL = I->getFunction()->getParent()->getDataLayout();
15669 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
15670 continue;
15671 NumZExts++;
15672 }
15673
15674 Ops.push_back(&Shuffle->getOperandUse(0));
15675 Ops.push_back(&Op);
15676 }
15677
15678 // It is only profitable to sink if we found two extends of the same kind.
15679 return !Ops.empty() && (NumSExts == 2 || NumZExts == 2);
15680 }
15681 default:
15682 return false;
15683 }
15684 return false;
15685}
15686
15688 bool IsLittleEndian) {
15689 Value *Op = ZExt->getOperand(0);
15690 auto *SrcTy = cast<FixedVectorType>(Op->getType());
15691 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
15692 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
15693 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth >= 64)
15694 return false;
15695
15696 assert(DstWidth % SrcWidth == 0 &&
15697 "TBL lowering is not supported for a ZExt instruction with this "
15698 "source & destination element type.");
15699 unsigned ZExtFactor = DstWidth / SrcWidth;
15700 unsigned NumElts = SrcTy->getNumElements();
15701 IRBuilder<> Builder(ZExt);
15702 SmallVector<int> Mask;
15703 // Create a mask that selects <0,...,Op[i]> for each lane of the destination
15704 // vector to replace the original ZExt. This can later be lowered to a set of
15705 // tbl instructions.
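  // For example, for "zext <8 x i8> %x to <8 x i32>" on little-endian, the
  // mask is <0,8,8,8, 1,8,8,8, ..., 7,8,8,8>: index 8 selects the zero lane
  // inserted below, and the resulting <32 x i8> shuffle is bitcast to the
  // destination type.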
15706 for (unsigned i = 0; i < NumElts * ZExtFactor; i++) {
15707 if (IsLittleEndian) {
15708 if (i % ZExtFactor == 0)
15709 Mask.push_back(i / ZExtFactor);
15710 else
15711 Mask.push_back(NumElts);
15712 } else {
15713 if ((i + 1) % ZExtFactor == 0)
15714 Mask.push_back((i - ZExtFactor + 1) / ZExtFactor);
15715 else
15716 Mask.push_back(NumElts);
15717 }
15718 }
15719
15720 auto *FirstEltZero = Builder.CreateInsertElement(
15721 PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
15722 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
15723 Result = Builder.CreateBitCast(Result, DstTy);
15724 if (DstTy != ZExt->getType())
15725 Result = Builder.CreateZExt(Result, ZExt->getType());
15726 ZExt->replaceAllUsesWith(Result);
15727 ZExt->eraseFromParent();
15728 return true;
15729}
15730
15731static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
15732 IRBuilder<> Builder(TI);
15734 int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
15735 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
15736 auto *DstTy = cast<FixedVectorType>(TI->getType());
15737 assert(SrcTy->getElementType()->isIntegerTy() &&
15738 "Non-integer type source vector element is not supported");
15739 assert(DstTy->getElementType()->isIntegerTy(8) &&
15740 "Unsupported destination vector element type");
15741 unsigned SrcElemTySz =
15742 cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
15743 unsigned DstElemTySz =
15744 cast<IntegerType>(DstTy->getElementType())->getBitWidth();
15745 assert((SrcElemTySz % DstElemTySz == 0) &&
15746 "Cannot lower truncate to tbl instructions for a source element size "
15747 "that is not divisible by the destination element size");
15748 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
15749 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
15750 "Unsupported source vector element type size");
15751 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
15752
15753 // Create a mask to choose every nth byte from the source vector table of
15754 // bytes to create the truncated destination vector, where 'n' is the truncate
15755 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose bytes
15756 // 0, 8, 16, ..., (Y-1)*8 for the little-endian format.
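  // For example, for "trunc <8 x i64> %x to <8 x i8>" on little-endian the
  // mask bytes are <0, 8, 16, 24, 32, 40, 48, 56, 255, ...>; the out-of-range
  // index 255 makes tbl produce zero for the unused lanes.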
15757 SmallVector<Constant *, 16> MaskConst;
15758 for (int Itr = 0; Itr < 16; Itr++) {
15759 if (Itr < NumElements)
15760 MaskConst.push_back(Builder.getInt8(
15761 IsLittleEndian ? Itr * TruncFactor
15762 : Itr * TruncFactor + (TruncFactor - 1)));
15763 else
15764 MaskConst.push_back(Builder.getInt8(255));
15765 }
15766
15767 int MaxTblSz = 128 * 4;
15768 int MaxSrcSz = SrcElemTySz * NumElements;
15769 int ElemsPerTbl =
15770 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
15771 assert(ElemsPerTbl <= 16 &&
15772 "Maximum elements selected using TBL instruction cannot exceed 16!");
15773
15774 int ShuffleCount = 128 / SrcElemTySz;
15775 SmallVector<int> ShuffleLanes;
15776 for (int i = 0; i < ShuffleCount; ++i)
15777 ShuffleLanes.push_back(i);
15778
15779 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
15780 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
15781 // call TBL & save the result in a vector of TBL results for combining later.
15783 while (ShuffleLanes.back() < NumElements) {
15784 Parts.push_back(Builder.CreateBitCast(
15785 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
15786
15787 if (Parts.size() == 4) {
15789 Intrinsic::aarch64_neon_tbl4, VecTy);
15790 Parts.push_back(ConstantVector::get(MaskConst));
15791 Results.push_back(Builder.CreateCall(F, Parts));
15792 Parts.clear();
15793 }
15794
15795 for (int i = 0; i < ShuffleCount; ++i)
15796 ShuffleLanes[i] += ShuffleCount;
15797 }
15798
15799 assert((Parts.empty() || Results.empty()) &&
15800 "Lowering trunc for vectors requiring different TBL instructions is "
15801 "not supported!");
15802 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
15803 // registers
15804 if (!Parts.empty()) {
15805 Intrinsic::ID TblID;
15806 switch (Parts.size()) {
15807 case 1:
15808 TblID = Intrinsic::aarch64_neon_tbl1;
15809 break;
15810 case 2:
15811 TblID = Intrinsic::aarch64_neon_tbl2;
15812 break;
15813 case 3:
15814 TblID = Intrinsic::aarch64_neon_tbl3;
15815 break;
15816 }
15817
15818 auto *F = Intrinsic::getDeclaration(TI->getModule(), TblID, VecTy);
15819 Parts.push_back(ConstantVector::get(MaskConst));
15820 Results.push_back(Builder.CreateCall(F, Parts));
15821 }
15822
15823 // Extract the destination vector from TBL result(s) after combining them
15824 // where applicable. Currently, at most two TBLs are supported.
15825 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
15826 "more than 2 tbl instructions!");
15827 Value *FinalResult = Results[0];
15828 if (Results.size() == 1) {
15829 if (ElemsPerTbl < 16) {
15830 SmallVector<int> FinalMask(ElemsPerTbl);
15831 std::iota(FinalMask.begin(), FinalMask.end(), 0);
15832 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
15833 }
15834 } else {
15835 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
15836 if (ElemsPerTbl < 16) {
15837 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
15838 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
15839 } else {
15840 std::iota(FinalMask.begin(), FinalMask.end(), 0);
15841 }
15842 FinalResult =
15843 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
15844 }
15845
15846 TI->replaceAllUsesWith(FinalResult);
15847 TI->eraseFromParent();
15848}
15849
15851 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
15852 // shuffle_vector instructions are serialized when targeting SVE,
15853 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
15854 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
15855 return false;
15856
15857 // Try to optimize conversions using tbl. This requires materializing constant
15858 // index vectors, which can increase code size and add loads. Skip the
15859 // transform unless the conversion is in a loop block guaranteed to execute
15860 // and we are not optimizing for size.
15861 Function *F = I->getParent()->getParent();
15862 if (!L || L->getHeader() != I->getParent() || F->hasMinSize() ||
15863 F->hasOptSize())
15864 return false;
15865
15866 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
15867 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
15868 if (!SrcTy || !DstTy)
15869 return false;
15870
15871 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
15872 // lowered to tbl instructions to insert the original i8 elements
15873 // into i8x lanes. This is enabled for cases where it is beneficial.
15874 auto *ZExt = dyn_cast<ZExtInst>(I);
15875 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
15876 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
15877 if (DstWidth % 8 != 0)
15878 return false;
15879
15880 auto *TruncDstType =
15881 cast<FixedVectorType>(VectorType::getTruncatedElementVectorType(DstTy));
15882 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
15883 // the remaining ZExt folded into the user, don't use tbl lowering.
15884 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
15885 if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
15888 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
15889 return false;
15890
15891 DstTy = TruncDstType;
15892 }
15893
15894 return createTblShuffleForZExt(ZExt, DstTy, Subtarget->isLittleEndian());
15895 }
15896
15897 auto *UIToFP = dyn_cast<UIToFPInst>(I);
15898 if (UIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
15899 DstTy->getElementType()->isFloatTy()) {
15900 IRBuilder<> Builder(I);
15901 auto *ZExt = cast<ZExtInst>(
15902 Builder.CreateZExt(I->getOperand(0), VectorType::getInteger(DstTy)));
15903 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
15904 I->replaceAllUsesWith(UI);
15905 I->eraseFromParent();
15906 return createTblShuffleForZExt(ZExt, cast<FixedVectorType>(ZExt->getType()),
15907 Subtarget->isLittleEndian());
15908 }
15909
15910 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
15911 // followed by a truncate that is lowered using tbl4.
15912 auto *FPToUI = dyn_cast<FPToUIInst>(I);
15913 if (FPToUI &&
15914 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
15915 SrcTy->getElementType()->isFloatTy() &&
15916 DstTy->getElementType()->isIntegerTy(8)) {
15917 IRBuilder<> Builder(I);
15918 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
15919 VectorType::getInteger(SrcTy));
15920 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
15921 I->replaceAllUsesWith(TruncI);
15922 I->eraseFromParent();
15923 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
15924 return true;
15925 }
15926
15927 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
15928 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
15929 // per lane of the input that is represented using 1,2,3 or 4 128-bit table
15930 // registers
15931 auto *TI = dyn_cast<TruncInst>(I);
15932 if (TI && DstTy->getElementType()->isIntegerTy(8) &&
15933 ((SrcTy->getElementType()->isIntegerTy(32) ||
15934 SrcTy->getElementType()->isIntegerTy(64)) &&
15935 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
15936 createTblForTrunc(TI, Subtarget->isLittleEndian());
15937 return true;
15938 }
15939
15940 return false;
15941}
15942
15944 Align &RequiredAligment) const {
15945 if (!LoadedType.isSimple() ||
15946 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
15947 return false;
15948 // Cyclone supports unaligned accesses.
15949 RequiredAligment = Align(1);
15950 unsigned NumBits = LoadedType.getSizeInBits();
15951 return NumBits == 32 || NumBits == 64;
15952}
15953
15954/// A helper function for determining the number of interleaved accesses we
15955/// will generate when lowering accesses of the given type.
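/// For example, a <16 x i32> access on a 128-bit NEON target is 512 bits wide
/// and is therefore lowered as four interleaved accesses.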
15957 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
15958 unsigned VecSize = 128;
15959 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
15960 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
15961 if (UseScalable && isa<FixedVectorType>(VecTy))
15962 VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
15963 return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
15964}
15965
15968 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
15969 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
15970 return MOStridedAccess;
15972}
15973
15975 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
15976 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
15977 auto EC = VecTy->getElementCount();
15978 unsigned MinElts = EC.getKnownMinValue();
15979
15980 UseScalable = false;
15981
15982 if (!VecTy->isScalableTy() && !Subtarget->isNeonAvailable() &&
15983 !Subtarget->useSVEForFixedLengthVectors())
15984 return false;
15985
15986 if (VecTy->isScalableTy() && !Subtarget->hasSVEorSME())
15987 return false;
15988
15989 // Ensure that the predicate for this number of elements is available.
15990 if (Subtarget->hasSVE() && !getSVEPredPatternFromNumElements(MinElts))
15991 return false;
15992
15993 // Ensure the number of vector elements is greater than 1.
15994 if (MinElts < 2)
15995 return false;
15996
15997 // Ensure the element type is legal.
15998 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
15999 return false;
16000
16001 if (EC.isScalable()) {
16002 UseScalable = true;
16003 return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
16004 }
16005
16006 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
16007 if (Subtarget->useSVEForFixedLengthVectors()) {
16008 unsigned MinSVEVectorSize =
16009 std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
16010 if (VecSize % MinSVEVectorSize == 0 ||
16011 (VecSize < MinSVEVectorSize && isPowerOf2_32(MinElts) &&
16012 (!Subtarget->isNeonAvailable() || VecSize > 128))) {
16013 UseScalable = true;
16014 return true;
16015 }
16016 }
16017
16018 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
16019 // 128 will be split into multiple interleaved accesses.
16020 return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
16021}
16022
16024 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
16025 return ScalableVectorType::get(VTy->getElementType(), 2);
16026
16027 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
16028 return ScalableVectorType::get(VTy->getElementType(), 4);
16029
16030 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
16031 return ScalableVectorType::get(VTy->getElementType(), 8);
16032
16033 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
16034 return ScalableVectorType::get(VTy->getElementType(), 8);
16035
16036 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
16037 return ScalableVectorType::get(VTy->getElementType(), 2);
16038
16039 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
16040 return ScalableVectorType::get(VTy->getElementType(), 4);
16041
16042 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
16043 return ScalableVectorType::get(VTy->getElementType(), 8);
16044
16045 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
16046 return ScalableVectorType::get(VTy->getElementType(), 16);
16047
16048 llvm_unreachable("Cannot handle input vector type");
16049}
16050
16051static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
16052 bool Scalable, Type *LDVTy,
16053 Type *PtrTy) {
16054 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
16055 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
16056 Intrinsic::aarch64_sve_ld3_sret,
16057 Intrinsic::aarch64_sve_ld4_sret};
16058 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
16059 Intrinsic::aarch64_neon_ld3,
16060 Intrinsic::aarch64_neon_ld4};
16061 if (Scalable)
16062 return Intrinsic::getDeclaration(M, SVELoads[Factor - 2], {LDVTy});
16063
16064 return Intrinsic::getDeclaration(M, NEONLoads[Factor - 2], {LDVTy, PtrTy});
16065}
16066
16067static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
16068 bool Scalable, Type *STVTy,
16069 Type *PtrTy) {
16070 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
16071 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
16072 Intrinsic::aarch64_sve_st3,
16073 Intrinsic::aarch64_sve_st4};
16074 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
16075 Intrinsic::aarch64_neon_st3,
16076 Intrinsic::aarch64_neon_st4};
16077 if (Scalable)
16078 return Intrinsic::getDeclaration(M, SVEStores[Factor - 2], {STVTy});
16079
16080 return Intrinsic::getDeclaration(M, NEONStores[Factor - 2], {STVTy, PtrTy});
16081}
16082
16083/// Lower an interleaved load into a ldN intrinsic.
16084///
16085/// E.g. Lower an interleaved load (Factor = 2):
16086/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
16087/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
16088/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
16089///
16090/// Into:
16091/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
16092/// %vec0 = extractvalue { <4 x i32>, <4 x i32> } %ld2, 0
16093/// %vec1 = extractvalue { <4 x i32>, <4 x i32> } %ld2, 1
16096 ArrayRef<unsigned> Indices, unsigned Factor) const {
16097 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16098 "Invalid interleave factor");
16099 assert(!Shuffles.empty() && "Empty shufflevector input");
16100 assert(Shuffles.size() == Indices.size() &&
16101 "Unmatched number of shufflevectors and indices");
16102
16103 const DataLayout &DL = LI->getModule()->getDataLayout();
16104
16105 VectorType *VTy = Shuffles[0]->getType();
16106
16107 // Skip if we do not have NEON and skip illegal vector types. We can
16108 // "legalize" wide vector types into multiple interleaved accesses as long as
16109 // the vector types are divisible by 128.
16110 bool UseScalable;
16111 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
16112 return false;
16113
16114 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
16115
16116 auto *FVTy = cast<FixedVectorType>(VTy);
16117
16118 // A pointer vector can not be the return type of the ldN intrinsics. Need to
16119 // load integer vectors first and then convert to pointer vectors.
16120 Type *EltTy = FVTy->getElementType();
16121 if (EltTy->isPointerTy())
16122 FVTy =
16123 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
16124
16125 // If we're going to generate more than one load, reset the sub-vector type
16126 // to something legal.
16127 FVTy = FixedVectorType::get(FVTy->getElementType(),
16128 FVTy->getNumElements() / NumLoads);
16129
16130 auto *LDVTy =
16131 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
16132
16133 IRBuilder<> Builder(LI);
16134
16135 // The base address of the load.
16136 Value *BaseAddr = LI->getPointerOperand();
16137
16138 Type *PtrTy = LI->getPointerOperandType();
16139 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
16140 LDVTy->getElementCount());
16141
16142 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
16143 UseScalable, LDVTy, PtrTy);
16144
16145 // Holds sub-vectors extracted from the load intrinsic return values. The
16146 // sub-vectors are associated with the shufflevector instructions they will
16147 // replace.
16149
16150 Value *PTrue = nullptr;
16151 if (UseScalable) {
16152 std::optional<unsigned> PgPattern =
16153 getSVEPredPatternFromNumElements(FVTy->getNumElements());
16154 if (Subtarget->getMinSVEVectorSizeInBits() ==
16155 Subtarget->getMaxSVEVectorSizeInBits() &&
16156 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
16157 PgPattern = AArch64SVEPredPattern::all;
16158
16159 auto *PTruePat =
16160 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
16161 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
16162 {PTruePat});
16163 }
16164
16165 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
16166
16167 // If we're generating more than one load, compute the base address of
16168 // subsequent loads as an offset from the previous.
16169 if (LoadCount > 0)
16170 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
16171 FVTy->getNumElements() * Factor);
16172
16173 CallInst *LdN;
16174 if (UseScalable)
16175 LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
16176 else
16177 LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
16178
16179 // Extract and store the sub-vectors returned by the load intrinsic.
16180 for (unsigned i = 0; i < Shuffles.size(); i++) {
16181 ShuffleVectorInst *SVI = Shuffles[i];
16182 unsigned Index = Indices[i];
16183
16184 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
16185
16186 if (UseScalable)
16187 SubVec = Builder.CreateExtractVector(
16188 FVTy, SubVec,
16189 ConstantInt::get(Type::getInt64Ty(VTy->getContext()), 0));
16190
16191 // Convert the integer vector to pointer vector if the element is pointer.
16192 if (EltTy->isPointerTy())
16193 SubVec = Builder.CreateIntToPtr(
16195 FVTy->getNumElements()));
16196
16197 SubVecs[SVI].push_back(SubVec);
16198 }
16199 }
16200
16201 // Replace uses of the shufflevector instructions with the sub-vectors
16202 // returned by the load intrinsic. If a shufflevector instruction is
16203 // associated with more than one sub-vector, those sub-vectors will be
16204 // concatenated into a single wide vector.
16205 for (ShuffleVectorInst *SVI : Shuffles) {
16206 auto &SubVec = SubVecs[SVI];
16207 auto *WideVec =
16208 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
16209 SVI->replaceAllUsesWith(WideVec);
16210 }
16211
16212 return true;
16213}
16214
16215template <typename Iter>
16216bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
16217 int MaxLookupDist = 20;
16218 unsigned IdxWidth = DL.getIndexSizeInBits(0);
16219 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
16220 const Value *PtrA1 =
16221 Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
16222
16223 while (++It != End) {
16224 if (It->isDebugOrPseudoInst())
16225 continue;
16226 if (MaxLookupDist-- == 0)
16227 break;
16228 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
16229 const Value *PtrB1 =
16230 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
16231 DL, OffsetB);
16232 if (PtrA1 == PtrB1 &&
16233 (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
16234 .abs() == 16)
16235 return true;
16236 }
16237 }
16238
16239 return false;
16240}
16241
16242/// Lower an interleaved store into a stN intrinsic.
16243///
16244/// E.g. Lower an interleaved store (Factor = 3):
16245/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
16246/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
16247/// store <12 x i32> %i.vec, <12 x i32>* %ptr
16248///
16249/// Into:
16250/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
16251/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
16252/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
16253/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
16254///
16255/// Note that the new shufflevectors will be removed and we'll only generate one
16256/// st3 instruction in CodeGen.
16257///
16258/// Example for a more general valid mask (Factor 3). Lower:
16259/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
16260/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
16261/// store <12 x i32> %i.vec, <12 x i32>* %ptr
16262///
16263/// Into:
16264/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
16265/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
16266/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
16267/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
16269 ShuffleVectorInst *SVI,
16270 unsigned Factor) const {
16271
16272 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16273 "Invalid interleave factor");
16274
16275 auto *VecTy = cast<FixedVectorType>(SVI->getType());
16276 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
16277
16278 unsigned LaneLen = VecTy->getNumElements() / Factor;
16279 Type *EltTy = VecTy->getElementType();
16280 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
16281
16282 const DataLayout &DL = SI->getModule()->getDataLayout();
16283 bool UseScalable;
16284
16285 // Skip if we do not have NEON and skip illegal vector types. We can
16286 // "legalize" wide vector types into multiple interleaved accesses as long as
16287 // the vector types are divisible by 128.
16288 if (!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
16289 return false;
16290
16291 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
16292
16293 Value *Op0 = SVI->getOperand(0);
16294 Value *Op1 = SVI->getOperand(1);
16295 IRBuilder<> Builder(SI);
16296
16297 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
16298 // vectors to integer vectors.
16299 if (EltTy->isPointerTy()) {
16300 Type *IntTy = DL.getIntPtrType(EltTy);
16301 unsigned NumOpElts =
16302 cast<FixedVectorType>(Op0->getType())->getNumElements();
16303
16304 // Convert to the corresponding integer vector.
16305 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
16306 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
16307 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
16308
16309 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
16310 }
16311
16312 // If we're going to generate more than one store, reset the lane length
16313 // and sub-vector type to something legal.
16314 LaneLen /= NumStores;
16315 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
16316
16317 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
16318 : SubVecTy;
16319
16320 // The base address of the store.
16321 Value *BaseAddr = SI->getPointerOperand();
16322
16323 auto Mask = SVI->getShuffleMask();
16324
16325 // Sanity check: bail out if all of the mask indices are out of range.
16326 // If the mask is `poison`, `Mask` may be a vector of -1s.
16327 // If all of them are `poison`, an out-of-bounds read would happen later.
16328 if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
16329 return false;
16330 }
16331 // A 64-bit st2 which does not start at element 0 will involve adding extra
16332 // ext elements, making the st2 unprofitable. Also, if there is a nearby store
16333 // that points to BaseAddr+16 or BaseAddr-16, then it is better left as a
16334 // zip;stp pair, which has higher throughput.
16335 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
16336 (Mask[0] != 0 ||
16337 hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
16338 DL) ||
16339 hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
16340 BaseAddr, DL)))
16341 return false;
16342
16343 Type *PtrTy = SI->getPointerOperandType();
16344 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
16345 STVTy->getElementCount());
16346
16347 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
16348 UseScalable, STVTy, PtrTy);
16349
16350 Value *PTrue = nullptr;
16351 if (UseScalable) {
16352 std::optional<unsigned> PgPattern =
16353 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
16354 if (Subtarget->getMinSVEVectorSizeInBits() ==
16355 Subtarget->getMaxSVEVectorSizeInBits() &&
16356 Subtarget->getMinSVEVectorSizeInBits() ==
16357 DL.getTypeSizeInBits(SubVecTy))
16358 PgPattern = AArch64SVEPredPattern::all;
16359
16360 auto *PTruePat =
16361 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
16362 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
16363 {PTruePat});
16364 }
16365
16366 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
16367
16369
16370 // Split the shufflevector operands into sub vectors for the new stN call.
16371 for (unsigned i = 0; i < Factor; i++) {
16372 Value *Shuffle;
16373 unsigned IdxI = StoreCount * LaneLen * Factor + i;
16374 if (Mask[IdxI] >= 0) {
16375 Shuffle = Builder.CreateShuffleVector(
16376 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
16377 } else {
16378 unsigned StartMask = 0;
16379 for (unsigned j = 1; j < LaneLen; j++) {
16380 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
16381 if (Mask[IdxJ] >= 0) {
16382 StartMask = Mask[IdxJ] - j;
16383 break;
16384 }
16385 }
16386 // Note: Filling undef gaps with random elements is ok, since
16387 // those elements were being written anyway (with undefs).
16388 // In the case of all undefs, we default to using elements from 0.
16389 // Note: StartMask cannot be negative; it's checked in
16390 // isReInterleaveMask
16391 Shuffle = Builder.CreateShuffleVector(
16392 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
16393 }
16394
16395 if (UseScalable)
16396 Shuffle = Builder.CreateInsertVector(
16397 STVTy, UndefValue::get(STVTy), Shuffle,
16398 ConstantInt::get(Type::getInt64Ty(STVTy->getContext()), 0));
16399
16400 Ops.push_back(Shuffle);
16401 }
16402
16403 if (UseScalable)
16404 Ops.push_back(PTrue);
16405
16406 // If we're generating more than one store, compute the base address of
16407 // subsequent stores as an offset from the previous one.
16408 if (StoreCount > 0)
16409 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
16410 BaseAddr, LaneLen * Factor);
16411
16412 Ops.push_back(BaseAddr);
16413 Builder.CreateCall(StNFunc, Ops);
16414 }
16415 return true;
16416}
16417
16419 IntrinsicInst *DI, LoadInst *LI) const {
16420 // Only deinterleave2 supported at present.
16421 if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
16422 return false;
16423
16424 // Only a factor of 2 supported at present.
16425 const unsigned Factor = 2;
16426
16427 VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
16428 const DataLayout &DL = DI->getModule()->getDataLayout();
16429 bool UseScalable;
16430 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
16431 return false;
16432
16433 // TODO: Add support for using SVE instructions with fixed types later, using
16434 // the code from lowerInterleavedLoad to obtain the correct container type.
16435 if (UseScalable && !VTy->isScalableTy())
16436 return false;
16437
16438 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
16439
16440 VectorType *LdTy =
16442 VTy->getElementCount().divideCoefficientBy(NumLoads));
16443
16444 Type *PtrTy = LI->getPointerOperandType();
16445 Function *LdNFunc = getStructuredLoadFunction(DI->getModule(), Factor,
16446 UseScalable, LdTy, PtrTy);
16447
16448 IRBuilder<> Builder(LI);
16449
16450 Value *Pred = nullptr;
16451 if (UseScalable)
16452 Pred =
16453 Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
16454
16455 Value *BaseAddr = LI->getPointerOperand();
16456 Value *Result;
16457 if (NumLoads > 1) {
16458 Value *Left = PoisonValue::get(VTy);
16459 Value *Right = PoisonValue::get(VTy);
16460
16461 for (unsigned I = 0; I < NumLoads; ++I) {
16462 Value *Offset = Builder.getInt64(I * Factor);
16463
16464 Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
16465 Value *LdN = nullptr;
16466 if (UseScalable)
16467 LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
16468 else
16469 LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
16470
16471 Value *Idx =
16472 Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
16473 Left = Builder.CreateInsertVector(
16474 VTy, Left, Builder.CreateExtractValue(LdN, 0), Idx);
16475 Right = Builder.CreateInsertVector(
16476 VTy, Right, Builder.CreateExtractValue(LdN, 1), Idx);
16477 }
16478
16479 Result = PoisonValue::get(DI->getType());
16480 Result = Builder.CreateInsertValue(Result, Left, 0);
16481 Result = Builder.CreateInsertValue(Result, Right, 1);
16482 } else {
16483 if (UseScalable)
16484 Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
16485 else
16486 Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
16487 }
16488
16489 DI->replaceAllUsesWith(Result);
16490 return true;
16491}
16492
16494 IntrinsicInst *II, StoreInst *SI) const {
16495 // Only interleave2 supported at present.
16496 if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
16497 return false;
16498
16499 // Only a factor of 2 supported at present.
16500 const unsigned Factor = 2;
16501
16502 VectorType *VTy = cast<VectorType>(II->getOperand(0)->getType());
16503 const DataLayout &DL = II->getModule()->getDataLayout();
16504 bool UseScalable;
16505 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
16506 return false;
16507
16508 // TODO: Add support for using SVE instructions with fixed types later, using
16509 // the code from lowerInterleavedStore to obtain the correct container type.
16510 if (UseScalable && !VTy->isScalableTy())
16511 return false;
16512
16513 unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
16514
16515 VectorType *StTy =
16517 VTy->getElementCount().divideCoefficientBy(NumStores));
16518
16519 Type *PtrTy = SI->getPointerOperandType();
16520 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
16521 UseScalable, StTy, PtrTy);
16522
16523 IRBuilder<> Builder(SI);
16524
16525 Value *BaseAddr = SI->getPointerOperand();
16526 Value *Pred = nullptr;
16527
16528 if (UseScalable)
16529 Pred =
16530 Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
16531
16532 Value *L = II->getOperand(0);
16533 Value *R = II->getOperand(1);
16534
16535 for (unsigned I = 0; I < NumStores; ++I) {
16536 Value *Address = BaseAddr;
16537 if (NumStores > 1) {
16538 Value *Offset = Builder.getInt64(I * Factor);
16539 Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
16540
16541 Value *Idx =
16542 Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
16543 L = Builder.CreateExtractVector(StTy, II->getOperand(0), Idx);
16544 R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
16545 }
16546
16547 if (UseScalable)
16548 Builder.CreateCall(StNFunc, {L, R, Pred, Address});
16549 else
16550 Builder.CreateCall(StNFunc, {L, R, Address});
16551 }
16552
16553 return true;
16554}
16555
16557 const MemOp &Op, const AttributeList &FuncAttributes) const {
16558 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
16559 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
16560 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
16561 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
16562 // taken one instruction to materialize the v2i64 zero and one store (with
16563 // restrictive addressing mode). Just do i64 stores.
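  // For example, under these rules a 16-byte-aligned memcpy of 64 bytes is
  // expanded with f128 copies (assuming FP/SIMD is usable), while a memset of
  // 16 bytes falls back to i64 stores.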
16564 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
16565 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
16566 if (Op.isAligned(AlignCheck))
16567 return true;
16568 unsigned Fast;
16569 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
16571 Fast;
16572 };
16573
16574 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
16575 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
16576 return MVT::v16i8;
16577 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
16578 return MVT::f128;
16579 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
16580 return MVT::i64;
16581 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
16582 return MVT::i32;
16583 return MVT::Other;
16584}
16585
16587 const MemOp &Op, const AttributeList &FuncAttributes) const {
16588 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
16589 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
16590 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
16591 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
16592 // taken one instruction to materialize the v2i64 zero and one store (with
16593 // restrictive addressing mode). Just do i64 stores.
16594 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
16595 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
16596 if (Op.isAligned(AlignCheck))
16597 return true;
16598 unsigned Fast;
16599 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
16601 Fast;
16602 };
16603
16604 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
16605 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
16606 return LLT::fixed_vector(2, 64);
16607 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
16608 return LLT::scalar(128);
16609 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
16610 return LLT::scalar(64);
16611 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
16612 return LLT::scalar(32);
16613 return LLT();
16614}
16615
16616// 12-bit optionally shifted immediates are legal for adds.
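// For example, "add x0, x1, #0xabc" and "add x0, x1, #0xabc, lsl #12" are
// both encodable, whereas an immediate such as 0x123456 has to be
// materialized into a register first.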
16618 if (Immed == std::numeric_limits<int64_t>::min()) {
16619 LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
16620 << ": avoid UB for INT64_MIN\n");
16621 return false;
16622 }
16623 // Same encoding for add/sub, just flip the sign.
16624 Immed = std::abs(Immed);
16625 bool IsLegal = ((Immed >> 12) == 0 ||
16626 ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
16627 LLVM_DEBUG(dbgs() << "Is " << Immed
16628 << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
16629 return IsLegal;
16630}
16631
16633 // We will only emit addvl/inc* instructions for SVE2
16634 if (!Subtarget->hasSVE2())
16635 return false;
16636
16637 // addvl's immediates are in terms of the number of bytes in a register.
16638 // Since there are 16 bytes in the base supported size (128 bits), we need to
16639 // divide the immediate by that much to give us a useful immediate to
16640 // multiply by vscale. We can't have a remainder as a result of this.
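  // For example, Imm == 32 corresponds to "addvl x0, x0, #2", since 32 bytes
  // is two multiples of the 16-byte base vector size.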
16641 if (Imm % 16 == 0)
16642 return isInt<6>(Imm / 16);
16643
16644 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
16645 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
16646 // of addvl as a result, so only take h|w|d into account.
16647 // Dec[h|w|d] will cover subtractions.
16648 // Immediates are in the range [1,16], so we can't do a 2's complement check.
16649 // FIXME: Can we make use of other patterns to cover other immediates?
16650
16651 // inch|dech
16652 if (Imm % 8 == 0)
16653 return std::abs(Imm / 8) <= 16;
16654 // incw|decw
16655 if (Imm % 4 == 0)
16656 return std::abs(Imm / 4) <= 16;
16657 // incd|decd
16658 if (Imm % 2 == 0)
16659 return std::abs(Imm / 2) <= 16;
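 // e.g. Imm == 24 == 3 * 8 maps to 'inch ..., all, mul #3', and Imm == -6 ==
 // -3 * 2 maps to 'decd ..., all, mul #3'.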
16660
16661 return false;
16662}
16663
16664// Return false to prevent folding
16665// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
16666// if the folding leads to worse code.
16667bool AArch64TargetLowering::isMulAddWithConstProfitable(
16668 SDValue AddNode, SDValue ConstNode) const {
16669 // Let the DAGCombiner decide for vector types and large types.
16670 const EVT VT = AddNode.getValueType();
16671 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
16672 return true;
16673
16674 // It is worse if c1 is legal add immediate, while c1*c2 is not
16675 // and has to be composed by at least two instructions.
16676 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
16677 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
16678 const int64_t C1 = C1Node->getSExtValue();
16679 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
16680 if (!isLegalAddImmediate(C1) || isLegalAddImmediate(C1C2.getSExtValue()))
16681 return true;
16682 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
16683 // Adapt to the width of a register.
16684 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
16685 AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn);
16686 if (Insn.size() > 1)
16687 return false;
16688
16689 // Default to true and let the DAGCombiner decide.
16690 return true;
16691}
16692
16693// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
16694// immediates is the same as for an add or a sub.
16695bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
16696 return isLegalAddImmediate(Immed);
16697}
16698
16699/// isLegalAddressingMode - Return true if the addressing mode represented
16700/// by AM is legal for this target, for a load/store of the specified type.
16701bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
16702 const AddrMode &AMode, Type *Ty,
16703 unsigned AS, Instruction *I) const {
16704 // AArch64 has five basic addressing modes:
16705 // reg
16706 // reg + 9-bit signed offset
16707 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
16708 // reg1 + reg2
16709 // reg + SIZE_IN_BYTES * reg
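 // For an i64 load these correspond to, e.g.: ldr x0, [x1]; ldur x0, [x1, #-256];
 // ldr x0, [x1, #32760]; ldr x0, [x1, x2]; and ldr x0, [x1, x2, lsl #3].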
16710
16711 // No global is ever allowed as a base.
16712 if (AMode.BaseGV)
16713 return false;
16714
16715 // No reg+reg+imm addressing.
16716 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
16717 return false;
16718
16719 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
16720 // `2*ScaledReg` into `BaseReg + ScaledReg`
16721 AddrMode AM = AMode;
16722 if (AM.Scale && !AM.HasBaseReg) {
16723 if (AM.Scale == 1) {
16724 AM.HasBaseReg = true;
16725 AM.Scale = 0;
16726 } else if (AM.Scale == 2) {
16727 AM.HasBaseReg = true;
16728 AM.Scale = 1;
16729 } else {
16730 return false;
16731 }
16732 }
16733
16734 // A base register is required in all addressing modes.
16735 if (!AM.HasBaseReg)
16736 return false;
16737
16738 if (Ty->isScalableTy()) {
16739 if (isa<ScalableVectorType>(Ty)) {
16740 // See if we have a foldable vscale-based offset, for vector types which
16741 // are either legal or smaller than the minimum; more work will be
16742 // required if we need to consider addressing for types which need
16743 // legalization by splitting.
16744 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
16745 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
16746 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
16747 isPowerOf2_64(VecNumBytes))
16748 return isInt<4>(AM.ScalableOffset / (int64_t)VecNumBytes);
16749
16750 uint64_t VecElemNumBytes =
16751 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
16752 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
16753 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
16754 }
16755
16756 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
16757 }
16758
16759 // No scalable offsets allowed for non-scalable types.
16760 if (AM.ScalableOffset)
16761 return false;
16762
16763 // check reg + imm case:
16764 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
16765 uint64_t NumBytes = 0;
16766 if (Ty->isSized()) {
16767 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
16768 NumBytes = NumBits / 8;
16769 if (!isPowerOf2_64(NumBits))
16770 NumBytes = 0;
16771 }
16772
16773 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
16774 AM.Scale);
16775}
16776
16777// Check whether the 2 offsets belong to the same imm24 range, and their high
16778// 12bits are same, then their high part can be decoded with the offset of add.
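// e.g. offsets 0x12345 and 0x12678 share the high part 0x12000, which is itself
// a legal add immediate.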
16779int64_t
16780AArch64TargetLowering::getPreferredLargeGEPBaseOffset(int64_t MinOffset,
16781 int64_t MaxOffset) const {
16782 int64_t HighPart = MinOffset & ~0xfffULL;
16783 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
16784 // Rebase the value to an integer multiple of imm12.
16785 return HighPart;
16786 }
16787
16788 return 0;
16789}
16790
16791bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
16792 // Consider splitting large offset of struct or array.
16793 return true;
16794}
16795
16796bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
16797 const MachineFunction &MF, EVT VT) const {
16798 VT = VT.getScalarType();
16799
16800 if (!VT.isSimple())
16801 return false;
16802
16803 switch (VT.getSimpleVT().SimpleTy) {
16804 case MVT::f16:
16805 return Subtarget->hasFullFP16();
16806 case MVT::f32:
16807 case MVT::f64:
16808 return true;
16809 default:
16810 break;
16811 }
16812
16813 return false;
16814}
16815
16816bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
16817 Type *Ty) const {
16818 switch (Ty->getScalarType()->getTypeID()) {
16819 case Type::FloatTyID:
16820 case Type::DoubleTyID:
16821 return true;
16822 default:
16823 return false;
16824 }
16825}
16826
16827bool AArch64TargetLowering::generateFMAsInMachineCombiner(
16828 EVT VT, CodeGenOptLevel OptLevel) const {
16829 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
16830 !useSVEForFixedLengthVectorVT(VT);
16831}
16832
16833const MCPhysReg *
16834AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
16835 // LR is a callee-save register, but we must treat it as clobbered by any call
16836 // site. Hence we include LR in the scratch registers, which are in turn added
16837 // as implicit-defs for stackmaps and patchpoints.
16838 static const MCPhysReg ScratchRegs[] = {
16839 AArch64::X16, AArch64::X17, AArch64::LR, 0
16840 };
16841 return ScratchRegs;
16842}
16843
16844ArrayRef<MCPhysReg> AArch64TargetLowering::getRoundingControlRegisters() const {
16845 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
16846 return RCRegs;
16847}
16848
16849bool
16850AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
16851 CombineLevel Level) const {
16852 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
16853 N->getOpcode() == ISD::SRL) &&
16854 "Expected shift op");
16855
16856 SDValue ShiftLHS = N->getOperand(0);
16857 EVT VT = N->getValueType(0);
16858
16859 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
16860 // combine it with shift 'N' to let it be lowered to UBFX except:
16861 // ((x >> C) & mask) << C.
16862 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
16863 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
16864 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
16865 if (isMask_64(TruncMask)) {
16866 SDValue AndLHS = ShiftLHS.getOperand(0);
16867 if (AndLHS.getOpcode() == ISD::SRL) {
16868 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
16869 if (N->getOpcode() == ISD::SHL)
16870 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
16871 return SRLC->getZExtValue() == SHLC->getZExtValue();
16872 return false;
16873 }
16874 }
16875 }
16876 }
16877 return true;
16878}
16879
16880bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
16881 const SDNode *N) const {
16882 assert(N->getOpcode() == ISD::XOR &&
16883 (N->getOperand(0).getOpcode() == ISD::SHL ||
16884 N->getOperand(0).getOpcode() == ISD::SRL) &&
16885 "Expected XOR(SHIFT) pattern");
16886
16887 // Only commute if the entire NOT mask is a hidden shifted mask.
16888 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
16889 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
16890 if (XorC && ShiftC) {
16891 unsigned MaskIdx, MaskLen;
16892 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
16893 unsigned ShiftAmt = ShiftC->getZExtValue();
16894 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
16895 if (N->getOperand(0).getOpcode() == ISD::SHL)
16896 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
16897 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
16898 }
16899 }
16900
16901 return false;
16902}
16903
16904bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
16905 const SDNode *N, CombineLevel Level) const {
16906 assert(((N->getOpcode() == ISD::SHL &&
16907 N->getOperand(0).getOpcode() == ISD::SRL) ||
16908 (N->getOpcode() == ISD::SRL &&
16909 N->getOperand(0).getOpcode() == ISD::SHL)) &&
16910 "Expected shift-shift mask");
16911 // Don't allow multiuse shift folding with the same shift amount.
16912 if (!N->getOperand(0)->hasOneUse())
16913 return false;
16914
16915 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
16916 EVT VT = N->getValueType(0);
16917 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
16918 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
16919 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
16920 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
16921 }
16922
16923 return true;
16924}
16925
16926bool AArch64TargetLowering::shouldFoldSelectWithIdentityConstant(
16927 unsigned BinOpcode, EVT VT) const {
16928 return VT.isScalableVector() && isTypeLegal(VT);
16929}
16930
16931bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
16932 Type *Ty) const {
16933 assert(Ty->isIntegerTy());
16934
16935 unsigned BitSize = Ty->getPrimitiveSizeInBits();
16936 if (BitSize == 0)
16937 return false;
16938
16939 int64_t Val = Imm.getSExtValue();
16940 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
16941 return true;
16942
16943 if ((int64_t)Val < 0)
16944 Val = ~Val;
16945 if (BitSize == 32)
16946 Val &= (1LL << 32) - 1;
16947
16948 unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
16949 // MOVZ is free so return true for one or fewer MOVK.
16950 return Shift < 3;
16951}
16952
16953bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
16954 unsigned Index) const {
16955 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
16956 return false;
16957
16958 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
16959}
16960
16961/// Turn vector tests of the signbit in the form of:
16962/// xor (sra X, elt_size(X)-1), -1
16963/// into:
16964/// cmge X, X, #0
16965static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
16966 const AArch64Subtarget *Subtarget) {
16967 EVT VT = N->getValueType(0);
16968 if (!Subtarget->hasNEON() || !VT.isVector())
16969 return SDValue();
16970
16971 // There must be a shift right algebraic before the xor, and the xor must be a
16972 // 'not' operation.
16973 SDValue Shift = N->getOperand(0);
16974 SDValue Ones = N->getOperand(1);
16975 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
16976 !ISD::isConstantSplatVectorAllOnes(Ones.getNode()))
16977 return SDValue();
16978
16979 // The shift should be smearing the sign bit across each vector element.
16980 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
16981 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
16982 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
16983 return SDValue();
16984
16985 return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
16986}
16987
16988// Given a vecreduce_add node, detect the below pattern and convert it to the
16989// node sequence with UABDL, [S|U]ABD and UADDLP.
16990//
16991// i32 vecreduce_add(
16992// v16i32 abs(
16993// v16i32 sub(
16994// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
16995// =================>
16996// i32 vecreduce_add(
16997// v4i32 UADDLP(
16998// v8i16 add(
16999// v8i16 zext(
17000// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
17001// v8i16 zext(
17002// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
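// i.e. the reduction computes sum(|a[i] - b[i]|) over all 16 byte lanes without
// first widening every lane to i32.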
17003static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
17004 SelectionDAG &DAG) {
17005 // Assumed i32 vecreduce_add
17006 if (N->getValueType(0) != MVT::i32)
17007 return SDValue();
17008
17009 SDValue VecReduceOp0 = N->getOperand(0);
17010 unsigned Opcode = VecReduceOp0.getOpcode();
17011 // Assumed v16i32 abs
17012 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
17013 return SDValue();
17014
17015 SDValue ABS = VecReduceOp0;
17016 // Assumed v16i32 sub
17017 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
17018 ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
17019 return SDValue();
17020
17021 SDValue SUB = ABS->getOperand(0);
17022 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
17023 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
17024 // Assumed v16i32 type
17025 if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
17026 SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
17027 return SDValue();
17028
17029 // Assumed zext or sext
17030 bool IsZExt = false;
17031 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
17032 IsZExt = true;
17033 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
17034 IsZExt = false;
17035 } else
17036 return SDValue();
17037
17038 SDValue EXT0 = SUB->getOperand(0);
17039 SDValue EXT1 = SUB->getOperand(1);
17040 // Assumed zext's operand has v16i8 type
17041 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
17042 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
17043 return SDValue();
17044
17045 // Pattern is detected. Let's convert it to a sequence of nodes.
17046 SDLoc DL(N);
17047
17048 // First, create the node pattern of UABD/SABD.
17049 SDValue UABDHigh8Op0 =
17050 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
17051 DAG.getConstant(8, DL, MVT::i64));
17052 SDValue UABDHigh8Op1 =
17053 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
17054 DAG.getConstant(8, DL, MVT::i64));
17055 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
17056 UABDHigh8Op0, UABDHigh8Op1);
17057 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
17058
17059 // Second, create the node pattern of UABAL.
17060 SDValue UABDLo8Op0 =
17061 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
17062 DAG.getConstant(0, DL, MVT::i64));
17063 SDValue UABDLo8Op1 =
17064 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
17065 DAG.getConstant(0, DL, MVT::i64));
17066 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
17067 UABDLo8Op0, UABDLo8Op1);
17068 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
17069 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
17070
17071 // Third, create the node of UADDLP.
17072 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
17073
17074 // Fourth, create the node of VECREDUCE_ADD.
17075 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
17076}
17077
17078// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
17079// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
17080// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
17081// If we have vectors larger than v16i8 we extract v16i8 vectors,
17082// follow the same steps above to get DOT instructions, concatenate them,
17083// and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
17084static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
17085 const AArch64Subtarget *ST) {
17086 if (!ST->hasDotProd())
17087 return performVecReduceAddCombineWithUADDLP(N, DAG);
17088
17089 SDValue Op0 = N->getOperand(0);
17090 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
17091 Op0.getValueType().getVectorElementType() != MVT::i32)
17092 return SDValue();
17093
17094 unsigned ExtOpcode = Op0.getOpcode();
17095 SDValue A = Op0;
17096 SDValue B;
17097 if (ExtOpcode == ISD::MUL) {
17098 A = Op0.getOperand(0);
17099 B = Op0.getOperand(1);
17100 if (A.getOpcode() != B.getOpcode() ||
17101 A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
17102 return SDValue();
17103 ExtOpcode = A.getOpcode();
17104 }
17105 if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
17106 return SDValue();
17107
17108 EVT Op0VT = A.getOperand(0).getValueType();
17109 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
17110 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
17111 if (!IsValidElementCount || !IsValidSize)
17112 return SDValue();
17113
17114 SDLoc DL(Op0);
17115 // For non-mla reductions B can be set to 1. For MLA we take the operand of
17116 // the extend B.
17117 if (!B)
17118 B = DAG.getConstant(1, DL, Op0VT);
17119 else
17120 B = B.getOperand(0);
17121
17122 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
17123 unsigned NumOfVecReduce;
17124 EVT TargetType;
17125 if (IsMultipleOf16) {
17126 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
17127 TargetType = MVT::v4i32;
17128 } else {
17129 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
17130 TargetType = MVT::v2i32;
17131 }
17132 auto DotOpcode =
17133 (ExtOpcode == ISD::ZERO_EXTEND) ? AArch64ISD::UDOT : AArch64ISD::SDOT;
17134 // Handle the case where we need to generate only one Dot operation.
17135 if (NumOfVecReduce == 1) {
17136 SDValue Zeros = DAG.getConstant(0, DL, TargetType);
17137 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
17138 A.getOperand(0), B);
17139 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
17140 }
17141 // Generate Dot instructions that are multiple of 16.
17142 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
17143 SmallVector<SDValue, 4> SDotVec16;
17144 unsigned I = 0;
17145 for (; I < VecReduce16Num; I += 1) {
17146 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
17147 SDValue Op0 =
17148 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
17149 DAG.getConstant(I * 16, DL, MVT::i64));
17150 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
17151 DAG.getConstant(I * 16, DL, MVT::i64));
17152 SDValue Dot =
17153 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
17154 SDotVec16.push_back(Dot);
17155 }
17156 // Concatenate dot operations.
17157 EVT SDot16EVT =
17158 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
17159 SDValue ConcatSDot16 =
17160 DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
17161 SDValue VecReduceAdd16 =
17162 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
17163 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
17164 if (VecReduce8Num == 0)
17165 return VecReduceAdd16;
17166
17167 // Generate the remainder Dot operation that is multiple of 8.
17168 SmallVector<SDValue, 4> SDotVec8;
17169 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
17170 SDValue Vec8Op0 =
17171 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
17172 DAG.getConstant(I * 16, DL, MVT::i64));
17173 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
17174 DAG.getConstant(I * 16, DL, MVT::i64));
17175 SDValue Dot =
17176 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
17177 SDValue VecReduceAdd8 =
17178 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
17179 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
17180 VecReduceAdd8);
17181}
17182
17183// Given an (integer) vecreduce, we know the order of the inputs does not
17184// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
17185// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
17186// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
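// UADDLP pairs adjacent lanes rather than low/high halves, so the lanewise
// results differ; only the reduction sum is preserved, which is why this
// rewrite is restricted to UADDV.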
17187static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {
17188 auto DetectAddExtract = [&](SDValue A) {
17189 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
17190 // UADDLP(x) if found.
17191 assert(A.getOpcode() == ISD::ADD);
17192 EVT VT = A.getValueType();
17193 SDValue Op0 = A.getOperand(0);
17194 SDValue Op1 = A.getOperand(1);
17195 if (Op0.getOpcode() != Op1.getOpcode() ||
17196 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
17197 Op0.getOpcode() != ISD::SIGN_EXTEND))
17198 return SDValue();
17199 SDValue Ext0 = Op0.getOperand(0);
17200 SDValue Ext1 = Op1.getOperand(0);
17201 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
17202 Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
17203 Ext0.getOperand(0) != Ext1.getOperand(0))
17204 return SDValue();
17205 // Check that the type is twice the add types, and the extracts are from
17206 // upper/lower parts of the same source.
17207 if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
17208 VT.getVectorNumElements() * 2)
17209 return SDValue();
17210 if ((Ext0.getConstantOperandVal(1) != 0 ||
17211 Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
17212 (Ext1.getConstantOperandVal(1) != 0 ||
17213 Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
17214 return SDValue();
17215 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
17216 : AArch64ISD::SADDLP;
17217 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
17218 };
17219
17220 if (SDValue R = DetectAddExtract(A))
17221 return R;
17222
17223 if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
17224 if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
17225 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
17226 A.getOperand(1));
17227 if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
17228 if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
17229 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
17230 A.getOperand(0));
17231 return SDValue();
17232}
17233
17234// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
17235// UADDLV(concat), where the concat represents the 64-bit zext sources.
17236static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG) {
17237 // Look for add(zext(64-bit source), zext(64-bit source)), returning
17238 // UADDLV(concat(zext, zext)) if found.
17239 assert(A.getOpcode() == ISD::ADD);
17240 EVT VT = A.getValueType();
17241 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
17242 return SDValue();
17243 SDValue Op0 = A.getOperand(0);
17244 SDValue Op1 = A.getOperand(1);
17245 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
17246 return SDValue();
17247 SDValue Ext0 = Op0.getOperand(0);
17248 SDValue Ext1 = Op1.getOperand(0);
17249 EVT ExtVT0 = Ext0.getValueType();
17250 EVT ExtVT1 = Ext1.getValueType();
17251 // Check zext VTs are the same and 64-bit length.
17252 if (ExtVT0 != ExtVT1 ||
17253 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
17254 return SDValue();
17255 // Get VT for concat of zext sources.
17256 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
17257 SDValue Concat =
17258 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
17259
17260 switch (VT.getSimpleVT().SimpleTy) {
17261 case MVT::v2i64:
17262 case MVT::v4i32:
17263 return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
17264 case MVT::v8i16: {
17265 SDValue Uaddlv =
17266 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
17267 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
17268 }
17269 default:
17270 llvm_unreachable("Unhandled vector type");
17271 }
17272}
17273
17274static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
17275 SDValue A = N->getOperand(0);
17276 if (A.getOpcode() == ISD::ADD) {
17277 if (SDValue R = performUADDVAddCombine(A, DAG))
17278 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
17279 else if (SDValue R = performUADDVZextCombine(A, DAG))
17280 return R;
17281 }
17282 return SDValue();
17283}
17284
17285static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
17286 TargetLowering::DAGCombinerInfo &DCI,
17287 const AArch64Subtarget *Subtarget) {
17288 if (DCI.isBeforeLegalizeOps())
17289 return SDValue();
17290
17291 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
17292}
17293
17294SDValue
17295AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
17296 SelectionDAG &DAG,
17297 SmallVectorImpl<SDNode *> &Created) const {
17298 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
17299 if (isIntDivCheap(N->getValueType(0), Attr))
17300 return SDValue(N,0); // Lower SDIV as SDIV
17301
17302 EVT VT = N->getValueType(0);
17303
17304 // For scalable and fixed types, mark them as cheap so we can handle it much
17305 // later. This allows us to handle larger than legal types.
17306 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
17307 return SDValue(N, 0);
17308
17309 // fold (sdiv X, pow2)
17310 if ((VT != MVT::i32 && VT != MVT::i64) ||
17311 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
17312 return SDValue();
17313
17314 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
17315}
17316
17317SDValue
17318AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
17319 SelectionDAG &DAG,
17320 SmallVectorImpl<SDNode *> &Created) const {
17321 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
17322 if (isIntDivCheap(N->getValueType(0), Attr))
17323 return SDValue(N, 0); // Lower SREM as SREM
17324
17325 EVT VT = N->getValueType(0);
17326
17327 // For scalable and fixed types, mark them as cheap so we can handle it much
17328 // later. This allows us to handle larger than legal types.
17329 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
17330 return SDValue(N, 0);
17331
17332 // fold (srem X, pow2)
17333 if ((VT != MVT::i32 && VT != MVT::i64) ||
17334 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
17335 return SDValue();
17336
17337 unsigned Lg2 = Divisor.countr_zero();
17338 if (Lg2 == 0)
17339 return SDValue();
17340
17341 SDLoc DL(N);
17342 SDValue N0 = N->getOperand(0);
17343 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
17344 SDValue Zero = DAG.getConstant(0, DL, VT);
17345 SDValue CCVal, CSNeg;
17346 if (Lg2 == 1) {
17347 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
17348 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
17349 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
17350
17351 Created.push_back(Cmp.getNode());
17352 Created.push_back(And.getNode());
17353 } else {
17354 SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
17355 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
17356
17357 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
17358 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
17359 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
17360 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
17361 Negs.getValue(1));
17362
17363 Created.push_back(Negs.getNode());
17364 Created.push_back(AndPos.getNode());
17365 Created.push_back(AndNeg.getNode());
17366 }
17367
17368 return CSNeg;
17369}
17370
17371static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
17372 switch(getIntrinsicID(S.getNode())) {
17373 default:
17374 break;
17375 case Intrinsic::aarch64_sve_cntb:
17376 return 8;
17377 case Intrinsic::aarch64_sve_cnth:
17378 return 16;
17379 case Intrinsic::aarch64_sve_cntw:
17380 return 32;
17381 case Intrinsic::aarch64_sve_cntd:
17382 return 64;
17383 }
17384 return {};
17385}
17386
17387/// Calculates what the pre-extend type is, based on the extension
17388/// operation node provided by \p Extend.
17389///
17390/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
17391/// pre-extend type is pulled directly from the operand, while other extend
17392/// operations need a bit more inspection to get this information.
17393///
17394/// \param Extend The SDNode from the DAG that represents the extend operation
17395///
17396/// \returns The type representing the \p Extend source type, or \p MVT::Other
17397/// if no valid type can be determined
17398static EVT calculatePreExtendType(SDValue Extend) {
17399 switch (Extend.getOpcode()) {
17400 case ISD::SIGN_EXTEND:
17401 case ISD::ZERO_EXTEND:
17402 return Extend.getOperand(0).getValueType();
17403 case ISD::AssertSext:
17404 case ISD::AssertZext:
17405 case ISD::SIGN_EXTEND_INREG: {
17406 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
17407 if (!TypeNode)
17408 return MVT::Other;
17409 return TypeNode->getVT();
17410 }
17411 case ISD::AND: {
17412 ConstantSDNode *Constant =
17413 dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
17414 if (!Constant)
17415 return MVT::Other;
17416
17417 uint32_t Mask = Constant->getZExtValue();
17418
17419 if (Mask == UCHAR_MAX)
17420 return MVT::i8;
17421 else if (Mask == USHRT_MAX)
17422 return MVT::i16;
17423 else if (Mask == UINT_MAX)
17424 return MVT::i32;
17425
17426 return MVT::Other;
17427 }
17428 default:
17429 return MVT::Other;
17430 }
17431}
17432
17433/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
17434/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
17435/// SExt/ZExt rather than the scalar SExt/ZExt
17436static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
17437 EVT VT = BV.getValueType();
17438 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
17439 BV.getOpcode() != ISD::VECTOR_SHUFFLE)
17440 return SDValue();
17441
17442 // Use the first item in the buildvector/shuffle to get the size of the
17443 // extend, and make sure it looks valid.
17444 SDValue Extend = BV->getOperand(0);
17445 unsigned ExtendOpcode = Extend.getOpcode();
17446 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
17447 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
17448 ExtendOpcode == ISD::AssertSext;
17449 if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
17450 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
17451 return SDValue();
17452 // Shuffle inputs are vector, limit to SIGN_EXTEND and ZERO_EXTEND to ensure
17453 // calculatePreExtendType will work without issue.
17454 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
17455 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
17456 return SDValue();
17457
17458 // Restrict valid pre-extend data type
17459 EVT PreExtendType = calculatePreExtendType(Extend);
17460 if (PreExtendType == MVT::Other ||
17461 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
17462 return SDValue();
17463
17464 // Make sure all other operands are equally extended
17465 for (SDValue Op : drop_begin(BV->ops())) {
17466 if (Op.isUndef())
17467 continue;
17468 unsigned Opc = Op.getOpcode();
17469 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
17470 Opc == ISD::AssertSext;
17471 if (OpcIsSExt != IsSExt || calculatePreExtendType(Op) != PreExtendType)
17472 return SDValue();
17473 }
17474
17475 SDValue NBV;
17476 SDLoc DL(BV);
17477 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
17478 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
17479 EVT PreExtendLegalType =
17480 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
17481 SmallVector<SDValue, 8> NewOps;
17482 for (SDValue Op : BV->ops())
17483 NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
17484 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
17485 PreExtendLegalType));
17486 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
17487 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
17488 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
17489 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
17490 BV.getOperand(1).isUndef()
17491 ? DAG.getUNDEF(PreExtendVT)
17492 : BV.getOperand(1).getOperand(0),
17493 cast<ShuffleVectorSDNode>(BV)->getMask());
17494 }
17495 return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, NBV);
17496}
17497
17498/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
17499/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
17500static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
17501 // If the value type isn't a vector, none of the operands are going to be dups
17502 EVT VT = Mul->getValueType(0);
17503 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
17504 return SDValue();
17505
17506 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
17507 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
17508
17509 // Neither operands have been changed, don't make any further changes
17510 if (!Op0 && !Op1)
17511 return SDValue();
17512
17513 SDLoc DL(Mul);
17514 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
17515 Op1 ? Op1 : Mul->getOperand(1));
17516}
17517
17518// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
17519// Same for other types with equivalent constants.
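// Each (x >> 15) & 0x10001 term isolates the sign bit of both i16 halves of an
// i32 lane, and the multiply by 0xffff smears that bit across its half; that is
// exactly a compare-less-than-zero on the vector reinterpreted as v8i16.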
17520static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
17521 EVT VT = N->getValueType(0);
17522 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
17523 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
17524 return SDValue();
17525 if (N->getOperand(0).getOpcode() != ISD::AND ||
17526 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
17527 return SDValue();
17528
17529 SDValue And = N->getOperand(0);
17530 SDValue Srl = And.getOperand(0);
17531
17532 APInt V1, V2, V3;
17533 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
17534 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
17535 !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3))
17536 return SDValue();
17537
17538 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
17539 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
17540 V3 != (HalfSize - 1))
17541 return SDValue();
17542
17543 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
17544 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
17545 VT.getVectorElementCount() * 2);
17546
17547 SDLoc DL(N);
17548 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
17549 SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In);
17550 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
17551}
17552
17553static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
17554 TargetLowering::DAGCombinerInfo &DCI,
17555 const AArch64Subtarget *Subtarget) {
17556
17557 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
17558 return Ext;
17559 if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
17560 return Ext;
17561
17562 if (DCI.isBeforeLegalizeOps())
17563 return SDValue();
17564
17565 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
17566 // and in MachineCombiner pass, add+mul will be combined into madd.
17567 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
17568 SDLoc DL(N);
17569 EVT VT = N->getValueType(0);
17570 SDValue N0 = N->getOperand(0);
17571 SDValue N1 = N->getOperand(1);
17572 SDValue MulOper;
17573 unsigned AddSubOpc;
17574
17575 auto IsAddSubWith1 = [&](SDValue V) -> bool {
17576 AddSubOpc = V->getOpcode();
17577 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
17578 SDValue Opnd = V->getOperand(1);
17579 MulOper = V->getOperand(0);
17580 if (AddSubOpc == ISD::SUB)
17581 std::swap(Opnd, MulOper);
17582 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
17583 return C->isOne();
17584 }
17585 return false;
17586 };
17587
17588 if (IsAddSubWith1(N0)) {
17589 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
17590 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
17591 }
17592
17593 if (IsAddSubWith1(N1)) {
17594 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
17595 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
17596 }
17597
17598 // The below optimizations require a constant RHS.
17599 if (!isa<ConstantSDNode>(N1))
17600 return SDValue();
17601
17602 ConstantSDNode *C = cast<ConstantSDNode>(N1);
17603 const APInt &ConstValue = C->getAPIntValue();
17604
17605 // Allow the scaling to be folded into the `cnt` instruction by preventing
17606 // the scaling to be obscured here. This makes it easier to pattern match.
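 // e.g. (mul (aarch64.sve.cntd) 2) is left alone so it can be selected as a
 // single 'cntd xN, all, mul #2'.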
17607 if (IsSVECntIntrinsic(N0) ||
17608 (N0->getOpcode() == ISD::TRUNCATE &&
17609 (IsSVECntIntrinsic(N0->getOperand(0)))))
17610 if (ConstValue.sge(1) && ConstValue.sle(16))
17611 return SDValue();
17612
17613 // Multiplication of a power of two plus/minus one can be done more
17614 // cheaply as shift+add/sub. For now, this is true unilaterally. If
17615 // future CPUs have a cheaper MADD instruction, this may need to be
17616 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
17617 // 64-bit is 5 cycles, so this is always a win.
17618 // More aggressively, some multiplications N0 * C can be lowered to
17619 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
17620 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
17621 // TODO: lower more cases.
17622
17623 // TrailingZeroes is used to test if the mul can be lowered to
17624 // shift+add+shift.
17625 unsigned TrailingZeroes = ConstValue.countr_zero();
17626 if (TrailingZeroes) {
17627 // Conservatively do not lower to shift+add+shift if the mul might be
17628 // folded into smul or umul.
17629 if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
17630 isZeroExtended(N0, DAG)))
17631 return SDValue();
17632 // Conservatively do not lower to shift+add+shift if the mul might be
17633 // folded into madd or msub.
17634 if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
17635 N->use_begin()->getOpcode() == ISD::SUB))
17636 return SDValue();
17637 }
17638 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
17639 // and shift+add+shift.
17640 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
17641 unsigned ShiftAmt;
17642
17643 auto Shl = [&](SDValue N0, unsigned N1) {
17644 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
17645 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
17646 };
17647 auto Add = [&](SDValue N0, SDValue N1) {
17648 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
17649 };
17650 auto Sub = [&](SDValue N0, SDValue N1) {
17651 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
17652 };
17653 auto Negate = [&](SDValue N) {
17654 SDValue Zero = DAG.getConstant(0, DL, VT);
17655 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
17656 };
17657
17658 // Can the const C be decomposed into (1+2^M1)*(1+2^N1), eg:
17659 // C = 45 is equal to (1+4)*(1+8), we don't decompose it into (1+2)*(16-1) as
17660 // the (2^N - 1) can't be executed via a single instruction.
17661 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
17662 unsigned BitWidth = C.getBitWidth();
17663 for (unsigned i = 1; i < BitWidth / 2; i++) {
17664 APInt Rem;
17665 APInt X(BitWidth, (1 << i) + 1);
17666 APInt::sdivrem(C, X, N, Rem);
17667 APInt NVMinus1 = N - 1;
17668 if (Rem == 0 && NVMinus1.isPowerOf2()) {
17669 M = X;
17670 return true;
17671 }
17672 }
17673 return false;
17674 };
17675
17676 // Can the const C be decomposed into ((2^M + 1) * 2^N + 1), eg:
17677 // C = 11 is equal to (1+4)*2+1, we don't decompose it into (1+2)*4-1 as
17678 // the (2^N - 1) can't be executed via a single instruction.
17679 auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
17680 APInt CVMinus1 = C - 1;
17681 if (CVMinus1.isNegative())
17682 return false;
17683 unsigned TrailingZeroes = CVMinus1.countr_zero();
17684 APInt SCVMinus1 = CVMinus1.ashr(TrailingZeroes) - 1;
17685 if (SCVMinus1.isPowerOf2()) {
17686 unsigned BitWidth = SCVMinus1.getBitWidth();
17687 M = APInt(BitWidth, SCVMinus1.logBase2());
17688 N = APInt(BitWidth, TrailingZeroes);
17689 return true;
17690 }
17691 return false;
17692 };
17693
17694 if (ConstValue.isNonNegative()) {
17695 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
17696 // (mul x, 2^N - 1) => (sub (shl x, N), x)
17697 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
17698 // (mul x, (2^M + 1) * (2^N + 1))
17699 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
17700 // (mul x, ((2^M + 1) * 2^N + 1))
17701 // => MV = (add (shl x, M), x); (add (shl MV, N), x)
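 // e.g. x*24 (shifted constant 3): (shl (add (shl x, 1), x), 3);
 // x*7: (sub (shl x, 3), x).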
17702 APInt SCVMinus1 = ShiftedConstValue - 1;
17703 APInt SCVPlus1 = ShiftedConstValue + 1;
17704 APInt CVPlus1 = ConstValue + 1;
17705 APInt CVM, CVN;
17706 if (SCVMinus1.isPowerOf2()) {
17707 ShiftAmt = SCVMinus1.logBase2();
17708 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
17709 } else if (CVPlus1.isPowerOf2()) {
17710 ShiftAmt = CVPlus1.logBase2();
17711 return Sub(Shl(N0, ShiftAmt), N0);
17712 } else if (SCVPlus1.isPowerOf2()) {
17713 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
17714 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
17715 }
17716 if (Subtarget->hasALULSLFast() &&
17717 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
17718 APInt CVMMinus1 = CVM - 1;
17719 APInt CVNMinus1 = CVN - 1;
17720 unsigned ShiftM1 = CVMMinus1.logBase2();
17721 unsigned ShiftN1 = CVNMinus1.logBase2();
17722 // ALULSLFast implies that shifts by up to 4 places are fast
17723 if (ShiftM1 <= 4 && ShiftN1 <= 4) {
17724 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
17725 return Add(Shl(MVal, ShiftN1), MVal);
17726 }
17727 }
17728 if (Subtarget->hasALULSLFast() &&
17729 isPowPlusPlusOneConst(ConstValue, CVM, CVN)) {
17730 unsigned ShiftM = CVM.getZExtValue();
17731 unsigned ShiftN = CVN.getZExtValue();
17732 // ALULSLFast implies that shifts by up to 4 places are fast
17733 if (ShiftM <= 4 && ShiftN <= 4) {
17734 SDValue MVal = Add(Shl(N0, CVM.getZExtValue()), N0);
17735 return Add(Shl(MVal, CVN.getZExtValue()), N0);
17736 }
17737 }
17738 } else {
17739 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
17740 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
17741 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
17742 APInt SCVPlus1 = -ShiftedConstValue + 1;
17743 APInt CVNegPlus1 = -ConstValue + 1;
17744 APInt CVNegMinus1 = -ConstValue - 1;
17745 if (CVNegPlus1.isPowerOf2()) {
17746 ShiftAmt = CVNegPlus1.logBase2();
17747 return Sub(N0, Shl(N0, ShiftAmt));
17748 } else if (CVNegMinus1.isPowerOf2()) {
17749 ShiftAmt = CVNegMinus1.logBase2();
17750 return Negate(Add(Shl(N0, ShiftAmt), N0));
17751 } else if (SCVPlus1.isPowerOf2()) {
17752 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
17753 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
17754 }
17755 }
17756
17757 return SDValue();
17758}
17759
17760static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
17761 SelectionDAG &DAG) {
17762 // Take advantage of vector comparisons producing 0 or -1 in each lane to
17763 // optimize away operation when it's from a constant.
17764 //
17765 // The general transformation is:
17766 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
17767 // AND(VECTOR_CMP(x,y), constant2)
17768 // constant2 = UNARYOP(constant)
17769
17770 // Early exit if this isn't a vector operation, the operand of the
17771 // unary operation isn't a bitwise AND, or if the sizes of the operations
17772 // aren't the same.
17773 EVT VT = N->getValueType(0);
17774 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
17775 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
17776 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
17777 return SDValue();
17778
17779 // Now check that the other operand of the AND is a constant. We could
17780 // make the transformation for non-constant splats as well, but it's unclear
17781 // that would be a benefit as it would not eliminate any operations, just
17782 // perform one more step in scalar code before moving to the vector unit.
17783 if (BuildVectorSDNode *BV =
17784 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
17785 // Bail out if the vector isn't a constant.
17786 if (!BV->isConstant())
17787 return SDValue();
17788
17789 // Everything checks out. Build up the new and improved node.
17790 SDLoc DL(N);
17791 EVT IntVT = BV->getValueType(0);
17792 // Create a new constant of the appropriate type for the transformed
17793 // DAG.
17794 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
17795 // The AND node needs bitcasts to/from an integer vector type around it.
17796 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
17797 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
17798 N->getOperand(0)->getOperand(0), MaskConst);
17799 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
17800 return Res;
17801 }
17802
17803 return SDValue();
17804}
17805
17806static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
17807 const AArch64Subtarget *Subtarget) {
17808 // First try to optimize away the conversion when it's conditionally from
17809 // a constant. Vectors only.
17810 if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
17811 return Res;
17812
17813 EVT VT = N->getValueType(0);
17814 if (VT != MVT::f32 && VT != MVT::f64)
17815 return SDValue();
17816
17817 // Only optimize when the source and destination types have the same width.
17818 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
17819 return SDValue();
17820
17821 // If the result of an integer load is only used by an integer-to-float
17822 // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead.
17823 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
17824 SDValue N0 = N->getOperand(0);
17825 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
17826 N0.hasOneUse() &&
17827 // Do not change the width of a volatile load.
17828 !cast<LoadSDNode>(N0)->isVolatile()) {
17829 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
17830 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
17831 LN0->getPointerInfo(), LN0->getAlign(),
17832 LN0->getMemOperand()->getFlags());
17833
17834 // Make sure successors of the original load stay after it by updating them
17835 // to use the new Chain.
17836 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
17837
17838 unsigned Opcode =
17839 (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
17840 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
17841 }
17842
17843 return SDValue();
17844}
17845
17846/// Fold a floating-point multiply by power of two into floating-point to
17847/// fixed-point conversion.
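/// e.g. (fptosi (fmul x, 8.0)) on v4f32 becomes a single fcvtzs with 3
/// fractional bits.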
17848static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
17849 TargetLowering::DAGCombinerInfo &DCI,
17850 const AArch64Subtarget *Subtarget) {
17851 if (!Subtarget->isNeonAvailable())
17852 return SDValue();
17853
17854 if (!N->getValueType(0).isSimple())
17855 return SDValue();
17856
17857 SDValue Op = N->getOperand(0);
17858 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
17859 return SDValue();
17860
17861 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
17862 return SDValue();
17863
17864 SDValue ConstVec = Op->getOperand(1);
17865 if (!isa<BuildVectorSDNode>(ConstVec))
17866 return SDValue();
17867
17868 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
17869 uint32_t FloatBits = FloatTy.getSizeInBits();
17870 if (FloatBits != 32 && FloatBits != 64 &&
17871 (FloatBits != 16 || !Subtarget->hasFullFP16()))
17872 return SDValue();
17873
17874 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
17875 uint32_t IntBits = IntTy.getSizeInBits();
17876 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
17877 return SDValue();
17878
17879 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
17880 if (IntBits > FloatBits)
17881 return SDValue();
17882
17883 BitVector UndefElements;
17884 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
17885 int32_t Bits = IntBits == 64 ? 64 : 32;
17886 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
17887 if (C == -1 || C == 0 || C > Bits)
17888 return SDValue();
17889
17890 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
17891 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
17892 return SDValue();
17893
17894 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
17895 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
17896 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
17897 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
17898 return SDValue();
17899 }
17900
17901 SDLoc DL(N);
17902 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
17903 N->getOpcode() == ISD::FP_TO_SINT_SAT);
17904 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
17905 : Intrinsic::aarch64_neon_vcvtfp2fxu;
17906 SDValue FixConv =
17907 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
17908 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
17909 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
17910 // We can handle smaller integers by generating an extra trunc.
17911 if (IntBits < FloatBits)
17912 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
17913
17914 return FixConv;
17915}
17916
17917/// Fold a floating-point divide by power of two into fixed-point to
17918/// floating-point conversion.
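/// e.g. (fdiv (sitofp x), 16.0) producing v4f32 becomes a single scvtf with 4
/// fractional bits.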
17919static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
17920 TargetLowering::DAGCombinerInfo &DCI,
17921 const AArch64Subtarget *Subtarget) {
17922 if (!Subtarget->hasNEON())
17923 return SDValue();
17924
17925 SDValue Op = N->getOperand(0);
17926 unsigned Opc = Op->getOpcode();
17927 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
17928 !Op.getOperand(0).getValueType().isSimple() ||
17929 (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
17930 return SDValue();
17931
17932 SDValue ConstVec = N->getOperand(1);
17933 if (!isa<BuildVectorSDNode>(ConstVec))
17934 return SDValue();
17935
17936 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
17937 int32_t IntBits = IntTy.getSizeInBits();
17938 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
17939 return SDValue();
17940
17941 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
17942 int32_t FloatBits = FloatTy.getSizeInBits();
17943 if (FloatBits != 32 && FloatBits != 64)
17944 return SDValue();
17945
17946 // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
17947 if (IntBits > FloatBits)
17948 return SDValue();
17949
17950 BitVector UndefElements;
17951 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
17952 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
17953 if (C == -1 || C == 0 || C > FloatBits)
17954 return SDValue();
17955
17956 MVT ResTy;
17957 unsigned NumLanes = Op.getValueType().getVectorNumElements();
17958 switch (NumLanes) {
17959 default:
17960 return SDValue();
17961 case 2:
17962 ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
17963 break;
17964 case 4:
17965 ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
17966 break;
17967 }
17968
17969 if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
17970 return SDValue();
17971
17972 SDLoc DL(N);
17973 SDValue ConvInput = Op.getOperand(0);
17974 bool IsSigned = Opc == ISD::SINT_TO_FP;
17975 if (IntBits < FloatBits)
17976 ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
17977 ResTy, ConvInput);
17978
17979 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
17980 : Intrinsic::aarch64_neon_vcvtfxu2fp;
17981 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
17982 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
17983 DAG.getConstant(C, DL, MVT::i32));
17984}
17985
17986static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
17987 const AArch64TargetLowering &TLI) {
17988 EVT VT = N->getValueType(0);
17989 SelectionDAG &DAG = DCI.DAG;
17990 SDLoc DL(N);
17991 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
17992
17993 if (!VT.isVector())
17994 return SDValue();
17995
17996 // The combining code works for NEON, SVE2 and SME.
17997 if (TLI.useSVEForFixedLengthVectorVT(VT, !Subtarget.isNeonAvailable()) ||
17998 (VT.isScalableVector() && !Subtarget.hasSVE2()))
17999 return SDValue();
18000
18001 SDValue N0 = N->getOperand(0);
18002 if (N0.getOpcode() != ISD::AND)
18003 return SDValue();
18004
18005 SDValue N1 = N->getOperand(1);
18006 if (N1.getOpcode() != ISD::AND)
18007 return SDValue();
18008
18009 // InstCombine does (not (neg a)) => (add a -1).
18010 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
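 // This works because (neg a) and (add a, -1) are bitwise complements of each
 // other, so (neg a) serves as the BSL select mask.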
18011 // Loop over all combinations of AND operands.
18012 for (int i = 1; i >= 0; --i) {
18013 for (int j = 1; j >= 0; --j) {
18014 SDValue O0 = N0->getOperand(i);
18015 SDValue O1 = N1->getOperand(j);
18016 SDValue Sub, Add, SubSibling, AddSibling;
18017
18018 // Find a SUB and an ADD operand, one from each AND.
18019 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
18020 Sub = O0;
18021 Add = O1;
18022 SubSibling = N0->getOperand(1 - i);
18023 AddSibling = N1->getOperand(1 - j);
18024 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
18025 Add = O0;
18026 Sub = O1;
18027 AddSibling = N0->getOperand(1 - i);
18028 SubSibling = N1->getOperand(1 - j);
18029 } else
18030 continue;
18031
18032 if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
18033 continue;
18034
18035 // The all-ones constant is always the right-hand operand of the Add.
18036 if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
18037 continue;
18038
18039 if (Sub.getOperand(1) != Add.getOperand(0))
18040 continue;
18041
18042 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
18043 }
18044 }
18045
18046 // (or (and a b) (and (not a) c)) => (bsl a b c)
18047 // We only have to look for constant vectors here since the general, variable
18048 // case can be handled in TableGen.
18049 unsigned Bits = VT.getScalarSizeInBits();
18050 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
18051 for (int i = 1; i >= 0; --i)
18052 for (int j = 1; j >= 0; --j) {
18053 APInt Val1, Val2;
18054
18055 if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
18056 ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) &&
18057 (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) {
18058 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
18059 N0->getOperand(1 - i), N1->getOperand(1 - j));
18060 }
18061 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
18062 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
18063 if (!BVN0 || !BVN1)
18064 continue;
18065
18066 bool FoundMatch = true;
18067 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
18068 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
18069 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
18070 if (!CN0 || !CN1 ||
18071 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
18072 FoundMatch = false;
18073 break;
18074 }
18075 }
18076 if (FoundMatch)
18077 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
18078 N0->getOperand(1 - i), N1->getOperand(1 - j));
18079 }
18080
18081 return SDValue();
18082}
18083
18084// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
18085// convert to csel(ccmp(.., cc0)), depending on cc1:
18086
18087// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
18088// =>
18089// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
18090//
18091// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
18092// =>
18093// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
18094static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
18095 EVT VT = N->getValueType(0);
18096 SDValue CSel0 = N->getOperand(0);
18097 SDValue CSel1 = N->getOperand(1);
18098
18099 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
18100 CSel1.getOpcode() != AArch64ISD::CSEL)
18101 return SDValue();
18102
18103 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
18104 return SDValue();
18105
18106 if (!isNullConstant(CSel0.getOperand(0)) ||
18107 !isOneConstant(CSel0.getOperand(1)) ||
18108 !isNullConstant(CSel1.getOperand(0)) ||
18109 !isOneConstant(CSel1.getOperand(1)))
18110 return SDValue();
18111
18112 SDValue Cmp0 = CSel0.getOperand(3);
18113 SDValue Cmp1 = CSel1.getOperand(3);
18114 AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(2);
18115 AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(2);
18116 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
18117 return SDValue();
18118 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
18119 Cmp0.getOpcode() == AArch64ISD::SUBS) {
18120 std::swap(Cmp0, Cmp1);
18121 std::swap(CC0, CC1);
18122 }
18123
18124 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
18125 return SDValue();
18126
18127 SDLoc DL(N);
18128 SDValue CCmp, Condition;
18129 unsigned NZCV;
18130
18131 if (N->getOpcode() == ISD::AND) {
18132 AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(CC0);
18133 Condition = DAG.getConstant(InvCC0, DL, MVT_CC);
18134 NZCV = AArch64CC::getNZCVToSatisfyCondCode(CC1);
18135 } else {
18136 AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(CC1);
18137 Condition = DAG.getConstant(CC0, DL, MVT_CC);
18138 NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvCC1);
18139 }
18140
18141 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
18142
18143 auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
18144 if (Op1 && Op1->getAPIntValue().isNegative() &&
18145 Op1->getAPIntValue().sgt(-32)) {
18146 // CCMP accepts constants in the range [0, 31];
18147 // if Op1 is a constant in the range [-31, -1], we
18148 // can select CCMN instead to avoid an extra mov
18149 SDValue AbsOp1 =
18150 DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
18151 CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1,
18152 NZCVOp, Condition, Cmp0);
18153 } else {
18154 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
18155 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
18156 }
18157 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
18158 CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
18159 CCmp);
18160}
18161
18162static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18163 const AArch64Subtarget *Subtarget,
18164 const AArch64TargetLowering &TLI) {
18165 SelectionDAG &DAG = DCI.DAG;
18166 EVT VT = N->getValueType(0);
18167
18168 if (SDValue R = performANDORCSELCombine(N, DAG))
18169 return R;
18170
18171 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
18172 return SDValue();
18173
18174 if (SDValue Res = tryCombineToBSL(N, DCI, TLI))
18175 return Res;
18176
18177 return SDValue();
18178}
18179
18180static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
18181 if (!MemVT.getVectorElementType().isSimple())
18182 return false;
18183
18184 uint64_t MaskForTy = 0ull;
18185 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
18186 case MVT::i8:
18187 MaskForTy = 0xffull;
18188 break;
18189 case MVT::i16:
18190 MaskForTy = 0xffffull;
18191 break;
18192 case MVT::i32:
18193 MaskForTy = 0xffffffffull;
18194 break;
18195 default:
18196 return false;
18197 break;
18198 }
18199
18200 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
18201 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
18202 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
18203
18204 return false;
18205}
18206
18207static SDValue performReinterpretCastCombine(SDNode *N) {
18208 SDValue LeafOp = SDValue(N, 0);
18209 SDValue Op = N->getOperand(0);
18210 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
18211 LeafOp.getValueType() != Op.getValueType())
18212 Op = Op->getOperand(0);
18213 if (LeafOp.getValueType() == Op.getValueType())
18214 return Op;
18215 return SDValue();
18216}
18217
18218static SDValue performSVEAndCombine(SDNode *N,
18219 TargetLowering::DAGCombinerInfo &DCI) {
18220 SelectionDAG &DAG = DCI.DAG;
18221 SDValue Src = N->getOperand(0);
18222 unsigned Opc = Src->getOpcode();
18223
18224 // Zero/any extend of an unsigned unpack
18225 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
18226 SDValue UnpkOp = Src->getOperand(0);
18227 SDValue Dup = N->getOperand(1);
18228
18229 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
18230 return SDValue();
18231
18232 SDLoc DL(N);
18233 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
18234 if (!C)
18235 return SDValue();
18236
18237 uint64_t ExtVal = C->getZExtValue();
18238
18239 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
18240 return ((ExtVal == 0xFF && VT == MVT::i8) ||
18241 (ExtVal == 0xFFFF && VT == MVT::i16) ||
18242 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
18243 };
18244
18245 // If the mask is fully covered by the unpack, we don't need to push
18246 // a new AND onto the operand
18247 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
18248 if (MaskAndTypeMatch(EltTy))
18249 return Src;
18250
18251 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
18252 // to see if the mask is all-ones of size MemTy.
18253 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(UnpkOp);
18254 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
18255 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
18256 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
18257 if (MaskAndTypeMatch(EltTy))
18258 return Src;
18259 }
18260
18261 // Truncate to prevent a DUP with an overly wide constant
18262 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
18263
18264 // Otherwise, make sure we propagate the AND to the operand
18265 // of the unpack
18266 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
18267 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
18268
18269 SDValue And = DAG.getNode(ISD::AND, DL,
18270 UnpkOp->getValueType(0), UnpkOp, Dup);
18271
18272 return DAG.getNode(Opc, DL, N->getValueType(0), And);
18273 }
18274
18275 if (DCI.isBeforeLegalizeOps())
18276 return SDValue();
18277
18278 // An AND with an all-active predicate operand is a no-op, so we can simply
18279 // return the other operand.
18280 if (isAllActivePredicate(DAG, N->getOperand(0)))
18281 return N->getOperand(1);
18282 if (isAllActivePredicate(DAG, N->getOperand(1)))
18283 return N->getOperand(0);
18284
18285 if (!EnableCombineMGatherIntrinsics)
18286 return SDValue();
18287
18288 SDValue Mask = N->getOperand(1);
18289
18290 if (!Src.hasOneUse())
18291 return SDValue();
18292
18293 EVT MemVT;
18294
18295 // SVE load instructions perform an implicit zero-extend, which makes them
18296 // perfect candidates for combining.
18297 switch (Opc) {
18301 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
18302 break;
18318 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
18319 break;
18320 default:
18321 return SDValue();
18322 }
18323
18324 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
18325 return Src;
18326
18327 return SDValue();
18328}
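// For illustration (a hypothetical function; assumes <arm_sve.h> and the SVE
// ACLE intrinsic names below), an extending load already zeroes the high bits
// of each element, so a subsequent mask of the memory type is redundant:
//
//   svuint32_t load_bytes(svbool_t pg, const uint8_t *p) {
//     return svand_n_u32_x(pg, svld1ub_u32(pg, p), 0xff);
//   }
//
// The AND above can be folded away, leaving just the zero-extending LD1B.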
18329
18330// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
18331static SDValue performANDSETCCCombine(SDNode *N,
18332 TargetLowering::DAGCombinerInfo &DCI) {
18333
18334 // This function performs an optimization on a specific pattern involving
18335 // an AND operation and SETCC (Set Condition Code) node.
18336
18337 SDValue SetCC = N->getOperand(0);
18338 EVT VT = N->getValueType(0);
18339 SelectionDAG &DAG = DCI.DAG;
18340
18341 // If the current node (N) is used by any SELECT instruction, bail out by
18342 // returning an empty SDValue, since applying the optimization there could
18343 // produce incorrect results.
18344 for (auto U : N->uses())
18345 if (U->getOpcode() == ISD::SELECT)
18346 return SDValue();
18347
18348 // Check if the operand is a SETCC node with floating-point comparison
18349 if (SetCC.getOpcode() == ISD::SETCC &&
18350 SetCC.getOperand(0).getValueType() == MVT::f32) {
18351
18352 SDValue Cmp;
18353 AArch64CC::CondCode CC;
18354
18355 // Check if the DAG is after legalization and if we can emit the conjunction
18356 if (!DCI.isBeforeLegalize() &&
18357 (Cmp = emitConjunction(DAG, SDValue(N, 0), CC))) {
18358
18359 AArch64CC::CondCode InvertedCC = AArch64CC::getInvertedCondCode(CC);
18360
18361 SDLoc DL(N);
18362 return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
18363 DAG.getConstant(0, DL, VT),
18364 DAG.getConstant(InvertedCC, DL, MVT::i32), Cmp);
18365 }
18366 }
18367 return SDValue();
18368}
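// Illustrative example (hypothetical function; exact codegen varies by CPU and
// optimisation level): a conjunction of two f32 comparisons such as
//
//   bool both_lt(float a, float b, float c, float d) { return a < b && c < d; }
//
// can be emitted as fcmp + fccmp + cset via the conjunction above, instead of
// two fcmp/cset pairs combined with an AND.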
18369
18370static SDValue performANDCombine(SDNode *N,
18371 TargetLowering::DAGCombinerInfo &DCI) {
18372 SelectionDAG &DAG = DCI.DAG;
18373 SDValue LHS = N->getOperand(0);
18374 SDValue RHS = N->getOperand(1);
18375 EVT VT = N->getValueType(0);
18376
18377 if (SDValue R = performANDORCSELCombine(N, DAG))
18378 return R;
18379
18380 if (SDValue R = performANDSETCCCombine(N, DCI))
18381 return R;
18382
18383 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
18384 return SDValue();
18385
18386 if (VT.isScalableVector())
18387 return performSVEAndCombine(N, DCI);
18388
18389 // The combining code below works only for NEON vectors. In particular, it
18390 // does not work for SVE when dealing with vectors wider than 128 bits.
18391 if (!VT.is64BitVector() && !VT.is128BitVector())
18392 return SDValue();
18393
18394 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
18395 if (!BVN)
18396 return SDValue();
18397
18398 // AND does not accept an immediate, so check if we can use a BIC immediate
18399 // instruction instead. We do this here instead of using a (and x, (mvni imm))
18400 // pattern in isel, because some immediates may be lowered to the preferred
18401 // (and x, (movi imm)) form, even though an mvni representation also exists.
18402 APInt DefBits(VT.getSizeInBits(), 0);
18403 APInt UndefBits(VT.getSizeInBits(), 0);
18404 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
18405 SDValue NewOp;
18406
18407 // Any bits known to already be 0 need not be cleared again, which can help
18408 // reduce the size of the immediate to one supported by the instruction.
18409 KnownBits Known = DAG.computeKnownBits(LHS);
18410 APInt ZeroSplat(VT.getSizeInBits(), 0);
18411 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
18412 ZeroSplat |= Known.Zero.zext(VT.getSizeInBits())
18413 << (Known.Zero.getBitWidth() * I);
18414
18415 DefBits = ~(DefBits | ZeroSplat);
18416 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
18417 DefBits, &LHS)) ||
18418 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
18419 DefBits, &LHS)))
18420 return NewOp;
18421
18422 UndefBits = ~(UndefBits | ZeroSplat);
18423 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
18424 UndefBits, &LHS)) ||
18425 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
18426 UndefBits, &LHS)))
18427 return NewOp;
18428 }
18429
18430 return SDValue();
18431}
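// For example (hypothetical function; assumes <arm_neon.h>), clearing the low
// byte of each 32-bit lane:
//
//   uint32x4_t clear_low_byte(uint32x4_t v) {
//     return vandq_u32(v, vdupq_n_u32(0xffffff00u));
//   }
//
// is expected to select a single "bic v0.4s, #255" rather than materialising
// the mask in a register and using a vector AND.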
18432
18433static SDValue performFADDCombine(SDNode *N,
18434 TargetLowering::DAGCombinerInfo &DCI) {
18435 SelectionDAG &DAG = DCI.DAG;
18436 SDValue LHS = N->getOperand(0);
18437 SDValue RHS = N->getOperand(1);
18438 EVT VT = N->getValueType(0);
18439 SDLoc DL(N);
18440
18441 if (!N->getFlags().hasAllowReassociation())
18442 return SDValue();
18443
18444 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
18445 auto ReassocComplex = [&](SDValue A, SDValue B) {
18446 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
18447 return SDValue();
18448 unsigned Opc = A.getConstantOperandVal(0);
18449 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
18450 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
18451 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
18452 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
18453 return SDValue();
18454 SDValue VCMLA = DAG.getNode(
18455 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
18456 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
18457 A.getOperand(2), A.getOperand(3));
18458 VCMLA->setFlags(A->getFlags());
18459 return VCMLA;
18460 };
18461 if (SDValue R = ReassocComplex(LHS, RHS))
18462 return R;
18463 if (SDValue R = ReassocComplex(RHS, LHS))
18464 return R;
18465
18466 return SDValue();
18467}
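// Sketch of the pattern (hypothetical function; assumes <arm_neon.h>, the
// v8.3 complex-number intrinsics and compilation with reassociation enabled):
//
//   float32x4_t acc(float32x4_t x, float32x4_t y, float32x4_t a, float32x4_t b) {
//     return vaddq_f32(x, vcmlaq_f32(y, a, b));
//   }
//
// The outer fadd is folded into the FCMLA accumulator, giving
// vcmla(fadd(x, y), a, b) and saving one dependent FADD.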
18468
18469static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
18470 switch (Opcode) {
18471 case ISD::STRICT_FADD:
18472 case ISD::FADD:
18473 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
18474 case ISD::ADD:
18475 return VT == MVT::i64;
18476 default:
18477 return false;
18478 }
18479}
18480
18481static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
18483
18485 if ((N.getOpcode() == ISD::SETCC) ||
18486 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
18487 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
18488 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
18489 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
18490 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
18491 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
18492 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
18493 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
18494 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
18495 // get_active_lane_mask is lowered to a whilelo instruction.
18496 N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
18497 return true;
18498
18499 return false;
18500}
18501
18502// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
18503// ... into: "ptrue p, all" + PTEST
18504static SDValue
18505performFirstTrueTestVectorCombine(SDNode *N,
18506 TargetLowering::DAGCombinerInfo &DCI,
18507 const AArch64Subtarget *Subtarget) {
18508 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18509 // Make sure PTEST can be legalised with illegal types.
18510 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
18511 return SDValue();
18512
18513 SDValue N0 = N->getOperand(0);
18514 EVT VT = N0.getValueType();
18515
18516 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
18517 !isNullConstant(N->getOperand(1)))
18518 return SDValue();
18519
18520 // Restrict the DAG combine to only cases where we're extracting from a
18521 // flag-setting operation.
18522 if (!isPredicateCCSettingOp(N0))
18523 return SDValue();
18524
18525 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
18526 SelectionDAG &DAG = DCI.DAG;
18527 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
18528 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
18529}
18530
18531// Materialize : Idx = (add (mul vscale, NumEls), -1)
18532// i1 = extract_vector_elt t37, Constant:i64<Idx>
18533// ... into: "ptrue p, all" + PTEST
18534static SDValue
18535performLastTrueTestVectorCombine(SDNode *N,
18536 TargetLowering::DAGCombinerInfo &DCI,
18537 const AArch64Subtarget *Subtarget) {
18538 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18539 // Make sure PTEST can be legalised with illegal types.
18540 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
18541 return SDValue();
18542
18543 SDValue N0 = N->getOperand(0);
18544 EVT OpVT = N0.getValueType();
18545
18546 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
18547 return SDValue();
18548
18549 // Idx == (add (mul vscale, NumEls), -1)
18550 SDValue Idx = N->getOperand(1);
18551 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
18552 return SDValue();
18553
18554 SDValue VS = Idx.getOperand(0);
18555 if (VS.getOpcode() != ISD::VSCALE)
18556 return SDValue();
18557
18558 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
18559 if (VS.getConstantOperandVal(0) != NumEls)
18560 return SDValue();
18561
18562 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
18563 SelectionDAG &DAG = DCI.DAG;
18564 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
18565 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
18566}
18567
18568static SDValue
18569performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18570 const AArch64Subtarget *Subtarget) {
18571 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18572 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
18573 return Res;
18574 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
18575 return Res;
18576
18577 SelectionDAG &DAG = DCI.DAG;
18578 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
18579
18580 EVT VT = N->getValueType(0);
18581 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
18582 bool IsStrict = N0->isStrictFPOpcode();
18583
18584 // extract(dup x) -> x
18585 if (N0.getOpcode() == AArch64ISD::DUP)
18586 return VT.isInteger() ? DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT)
18587 : N0.getOperand(0);
18588
18589 // Rewrite for pairwise fadd pattern
18590 // (f32 (extract_vector_elt
18591 // (fadd (vXf32 Other)
18592 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
18593 // ->
18594 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
18595 // (extract_vector_elt (vXf32 Other) 1))
18596 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
18597 // we can only do this when it's used only by the extract_vector_elt.
18598 if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
18599 (!IsStrict || N0.hasOneUse())) {
18600 SDLoc DL(N0);
18601 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
18602 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
18603
18604 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
18605 SDValue Other = N00;
18606
18607 // And handle the commutative case.
18608 if (!Shuffle) {
18609 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
18610 Other = N01;
18611 }
18612
18613 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
18614 Other == Shuffle->getOperand(0)) {
18615 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
18616 DAG.getConstant(0, DL, MVT::i64));
18617 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
18618 DAG.getConstant(1, DL, MVT::i64));
18619 if (!IsStrict)
18620 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
18621
18622 // For strict_fadd we need uses of the final extract_vector to be replaced
18623 // with the strict_fadd, but we also need uses of the chain output of the
18624 // original strict_fadd to use the chain output of the new strict_fadd as
18625 // otherwise it may not be deleted.
18626 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
18627 {VT, MVT::Other},
18628 {N0->getOperand(0), Extract1, Extract2});
18629 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
18630 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
18631 return SDValue(N, 0);
18632 }
18633 }
18634
18635 return SDValue();
18636}
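// For the pairwise case, a hypothetical source-level example (assuming
// <arm_neon.h>):
//
//   float sum_pair(float32x2_t v) {
//     return vget_lane_f32(v, 0) + vget_lane_f32(v, 1);
//   }
//
// is expected to become a single "faddp s0, v0.2s" instead of two lane moves
// and a scalar fadd.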
18637
18638static SDValue performConcatVectorsCombine(SDNode *N,
18639 TargetLowering::DAGCombinerInfo &DCI,
18640 SelectionDAG &DAG) {
18641 SDLoc dl(N);
18642 EVT VT = N->getValueType(0);
18643 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
18644 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
18645
18646 if (VT.isScalableVector())
18647 return SDValue();
18648
18649 // Optimize concat_vectors of truncated vectors, where the intermediate
18650 // type is illegal, to avoid said illegality, e.g.,
18651 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
18652 // (v2i16 (truncate (v2i64)))))
18653 // ->
18654 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
18655 // (v4i32 (bitcast (v2i64))),
18656 // <0, 2, 4, 6>)))
18657 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
18658 // on both input and result type, so we might generate worse code.
18659 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
18660 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
18661 N1Opc == ISD::TRUNCATE) {
18662 SDValue N00 = N0->getOperand(0);
18663 SDValue N10 = N1->getOperand(0);
18664 EVT N00VT = N00.getValueType();
18665
18666 if (N00VT == N10.getValueType() &&
18667 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
18668 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
18669 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
18670 SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
18671 for (size_t i = 0; i < Mask.size(); ++i)
18672 Mask[i] = i * 2;
18673 return DAG.getNode(ISD::TRUNCATE, dl, VT,
18674 DAG.getVectorShuffle(
18675 MidVT, dl,
18676 DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
18677 DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
18678 }
18679 }
18680
18681 if (N->getOperand(0).getValueType() == MVT::v4i8 ||
18682 N->getOperand(0).getValueType() == MVT::v2i16 ||
18683 N->getOperand(0).getValueType() == MVT::v2i8) {
18684 EVT SrcVT = N->getOperand(0).getValueType();
18685 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
18686 // loads to prevent having to go through the v4i8 load legalization that
18687 // needs to extend each element into a larger type.
18688 if (N->getNumOperands() % 2 == 0 &&
18689 all_of(N->op_values(), [SrcVT](SDValue V) {
18690 if (V.getValueType() != SrcVT)
18691 return false;
18692 if (V.isUndef())
18693 return true;
18694 LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
18695 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
18696 LD->getExtensionType() == ISD::NON_EXTLOAD;
18697 })) {
18698 EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
18699 EVT NVT = EVT::getVectorVT(*DAG.getContext(), FVT, N->getNumOperands());
18700 SmallVector<SDValue> Ops;
18701
18702 for (unsigned i = 0; i < N->getNumOperands(); i++) {
18703 SDValue V = N->getOperand(i);
18704 if (V.isUndef())
18705 Ops.push_back(DAG.getUNDEF(FVT));
18706 else {
18707 LoadSDNode *LD = cast<LoadSDNode>(V);
18708 SDValue NewLoad = DAG.getLoad(FVT, dl, LD->getChain(),
18709 LD->getBasePtr(), LD->getMemOperand());
18710 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
18711 Ops.push_back(NewLoad);
18712 }
18713 }
18714 return DAG.getBitcast(N->getValueType(0),
18715 DAG.getBuildVector(NVT, dl, Ops));
18716 }
18717 }
18718
18719 // Canonicalise concat_vectors to replace concatenations of truncated nots
18720 // with nots of concatenated truncates. This in some cases allows for multiple
18721 // redundant negations to be eliminated.
18722 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
18723 // (v4i16 (truncate (not (v4i32)))))
18724 // ->
18725 // (not (concat_vectors (v4i16 (truncate (v4i32))),
18726 // (v4i16 (truncate (v4i32)))))
18727 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
18728 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
18729 N->isOnlyUserOf(N1.getNode())) {
18730 auto isBitwiseVectorNegate = [](SDValue V) {
18731 return V->getOpcode() == ISD::XOR &&
18732 ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
18733 };
18734 SDValue N00 = N0->getOperand(0);
18735 SDValue N10 = N1->getOperand(0);
18736 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
18737 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
18738 return DAG.getNOT(
18739 dl,
18740 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
18741 DAG.getNode(ISD::TRUNCATE, dl, N0.getValueType(),
18742 N00->getOperand(0)),
18743 DAG.getNode(ISD::TRUNCATE, dl, N1.getValueType(),
18744 N10->getOperand(0))),
18745 VT);
18746 }
18747 }
18748
18749 // Wait till after everything is legalized to try this. That way we have
18750 // legal vector types and such.
18751 if (DCI.isBeforeLegalizeOps())
18752 return SDValue();
18753
18754 // Optimise concat_vectors of two identical binops with a 128-bit destination
18755 // size, combine into a binop of two concats of the source vectors. E.g.:
18756 // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
18757 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
18758 DAG.getTargetLoweringInfo().isBinOp(N0Opc) && N0->hasOneUse() &&
18759 N1->hasOneUse()) {
18760 SDValue N00 = N0->getOperand(0);
18761 SDValue N01 = N0->getOperand(1);
18762 SDValue N10 = N1->getOperand(0);
18763 SDValue N11 = N1->getOperand(1);
18764
18765 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
18766 SDValue Concat0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N00, N10);
18767 SDValue Concat1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N01, N11);
18768 return DAG.getNode(N0Opc, dl, VT, Concat0, Concat1);
18769 }
18770 }
18771
18772 auto IsRSHRN = [](SDValue Shr) {
18773 if (Shr.getOpcode() != AArch64ISD::VLSHR)
18774 return false;
18775 SDValue Op = Shr.getOperand(0);
18776 EVT VT = Op.getValueType();
18777 unsigned ShtAmt = Shr.getConstantOperandVal(1);
18778 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
18779 return false;
18780
18781 APInt Imm;
18782 if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift)
18783 Imm = APInt(VT.getScalarSizeInBits(),
18784 Op.getOperand(1).getConstantOperandVal(0)
18785 << Op.getOperand(1).getConstantOperandVal(1));
18786 else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP &&
18787 isa<ConstantSDNode>(Op.getOperand(1).getOperand(0)))
18788 Imm = APInt(VT.getScalarSizeInBits(),
18789 Op.getOperand(1).getConstantOperandVal(0));
18790 else
18791 return false;
18792
18793 if (Imm != 1ULL << (ShtAmt - 1))
18794 return false;
18795 return true;
18796 };
18797
18798 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
18799 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
18800 ((IsRSHRN(N1) &&
18801 N0.getConstantOperandVal(1) == N1.getConstantOperandVal(1)) ||
18802 N1.isUndef())) {
18803 SDValue X = N0.getOperand(0).getOperand(0);
18804 SDValue Y = N1.isUndef() ? DAG.getUNDEF(X.getValueType())
18805 : N1.getOperand(0).getOperand(0);
18806 EVT BVT =
18807 X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
18808 SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, dl, BVT, X, Y);
18809 SDValue Add = DAG.getNode(
18810 ISD::ADD, dl, BVT, CC,
18811 DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), dl, BVT));
18812 SDValue Shr =
18813 DAG.getNode(AArch64ISD::VLSHR, dl, BVT, Add, N0.getOperand(1));
18814 return Shr;
18815 }
18816
18817 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
18818 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
18819 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
18820 N0.getOperand(1) == N1.getOperand(1)) {
18821 SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(0),
18822 DAG.getUNDEF(N0.getValueType()));
18823 SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(1),
18824 DAG.getUNDEF(N0.getValueType()));
18825 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, E0, E1);
18826 }
18827
18828 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
18829 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
18830 // canonicalise to that.
18831 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
18832 assert(VT.getScalarSizeInBits() == 64);
18833 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
18834 DAG.getConstant(0, dl, MVT::i64));
18835 }
18836
18837 // Canonicalise concat_vectors so that the right-hand vector has as few
18838 // bit-casts as possible before its real operation. The primary matching
18839 // destination for these operations will be the narrowing "2" instructions,
18840 // which depend on the operation being performed on this right-hand vector.
18841 // For example,
18842 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
18843 // becomes
18844 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
18845
18846 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
18847 return SDValue();
18848 SDValue RHS = N1->getOperand(0);
18849 MVT RHSTy = RHS.getValueType().getSimpleVT();
18850 // If the RHS is not a vector, this is not the pattern we're looking for.
18851 if (!RHSTy.isVector())
18852 return SDValue();
18853
18854 LLVM_DEBUG(
18855 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
18856
18857 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
18858 RHSTy.getVectorNumElements() * 2);
18859 return DAG.getNode(ISD::BITCAST, dl, VT,
18860 DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
18861 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
18862 RHS));
18863}
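// One of the cases above (concat of two identical binops) corresponds to
// source such as this hypothetical NEON function (assuming <arm_neon.h>):
//
//   uint8x16_t havg(uint8x8_t a, uint8x8_t b, uint8x8_t c, uint8x8_t d) {
//     return vcombine_u8(vhadd_u8(a, b), vhadd_u8(c, d));
//   }
//
// which can be rewritten as one full-width UHADD of two concatenated inputs
// instead of two half-width halving adds.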
18864
18865static SDValue
18866performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18867 SelectionDAG &DAG) {
18868 if (DCI.isBeforeLegalizeOps())
18869 return SDValue();
18870
18871 EVT VT = N->getValueType(0);
18872 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
18873 return SDValue();
18874
18875 SDValue V = N->getOperand(0);
18876
18877 // NOTE: This combine exists in DAGCombiner, but that version's legality check
18878 // blocks this combine because the non-const case requires custom lowering.
18879 //
18880 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
18881 if (V.getOpcode() == ISD::SPLAT_VECTOR)
18882 if (isa<ConstantSDNode>(V.getOperand(0)))
18883 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
18884
18885 return SDValue();
18886}
18887
18888static SDValue
18889performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18890 SelectionDAG &DAG) {
18891 SDLoc DL(N);
18892 SDValue Vec = N->getOperand(0);
18893 SDValue SubVec = N->getOperand(1);
18894 uint64_t IdxVal = N->getConstantOperandVal(2);
18895 EVT VecVT = Vec.getValueType();
18896 EVT SubVT = SubVec.getValueType();
18897
18898 // Only do this for legal fixed vector types.
18899 if (!VecVT.isFixedLengthVector() ||
18900 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
18901 !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
18902 return SDValue();
18903
18904 // Ignore widening patterns.
18905 if (IdxVal == 0 && Vec.isUndef())
18906 return SDValue();
18907
18908 // Subvector must be half the width and an "aligned" insertion.
18909 unsigned NumSubElts = SubVT.getVectorNumElements();
18910 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
18911 (IdxVal != 0 && IdxVal != NumSubElts))
18912 return SDValue();
18913
18914 // Fold insert_subvector -> concat_vectors
18915 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
18916 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
18917 SDValue Lo, Hi;
18918 if (IdxVal == 0) {
18919 Lo = SubVec;
18920 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
18921 DAG.getVectorIdxConstant(NumSubElts, DL));
18922 } else {
18923 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
18924 DAG.getVectorIdxConstant(0, DL));
18925 Hi = SubVec;
18926 }
18927 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
18928}
18929
18930static SDValue tryCombineFixedPointConvert(SDNode *N,
18931 TargetLowering::DAGCombinerInfo &DCI,
18932 SelectionDAG &DAG) {
18933 // Wait until after everything is legalized to try this. That way we have
18934 // legal vector types and such.
18935 if (DCI.isBeforeLegalizeOps())
18936 return SDValue();
18937 // Transform a scalar conversion of a value from a lane extract into a
18938 // lane extract of a vector conversion. E.g., from foo1 to foo2:
18939 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
18940 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
18941 //
18942 // The second form interacts better with instruction selection and the
18943 // register allocator to avoid cross-class register copies that aren't
18944 // coalescable due to a lane reference.
18945
18946 // Check the operand and see if it originates from a lane extract.
18947 SDValue Op1 = N->getOperand(1);
18948 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
18949 return SDValue();
18950
18951 // Yep, no additional predication needed. Perform the transform.
18952 SDValue IID = N->getOperand(0);
18953 SDValue Shift = N->getOperand(2);
18954 SDValue Vec = Op1.getOperand(0);
18955 SDValue Lane = Op1.getOperand(1);
18956 EVT ResTy = N->getValueType(0);
18957 EVT VecResTy;
18958 SDLoc DL(N);
18959
18960 // The vector width should be 128 bits by the time we get here, even
18961 // if it started as 64 bits (the extract_vector handling will have
18962 // done so). Bail if it is not.
18963 if (Vec.getValueSizeInBits() != 128)
18964 return SDValue();
18965
18966 if (Vec.getValueType() == MVT::v4i32)
18967 VecResTy = MVT::v4f32;
18968 else if (Vec.getValueType() == MVT::v2i64)
18969 VecResTy = MVT::v2f64;
18970 else
18971 return SDValue();
18972
18973 SDValue Convert =
18974 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
18975 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
18976}
18977
18978// AArch64 high-vector "long" operations are formed by performing the non-high
18979// version on an extract_subvector of each operand which gets the high half:
18980//
18981// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
18982//
18983// However, there are cases which don't have an extract_high explicitly, but
18984// have another operation that can be made compatible with one for free. For
18985// example:
18986//
18987// (dupv64 scalar) --> (extract_high (dup128 scalar))
18988//
18989// This routine does the actual conversion of such DUPs, once outer routines
18990// have determined that everything else is in order.
18991// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
18992// similarly here.
18993static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
18994 MVT VT = N.getSimpleValueType();
18995 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
18996 N.getConstantOperandVal(1) == 0)
18997 N = N.getOperand(0);
18998
18999 switch (N.getOpcode()) {
19000 case AArch64ISD::DUP:
19001 case AArch64ISD::DUPLANE8:
19002 case AArch64ISD::DUPLANE16:
19003 case AArch64ISD::DUPLANE32:
19004 case AArch64ISD::DUPLANE64:
19005 case AArch64ISD::MOVI:
19006 case AArch64ISD::MOVIshift:
19007 case AArch64ISD::MOVIedit:
19008 case AArch64ISD::MOVImsl:
19009 case AArch64ISD::MVNIshift:
19010 case AArch64ISD::MVNImsl:
19011 break;
19012 default:
19013 // FMOV could be supported, but isn't very useful, as it would only occur
19014 // if you passed a bitcast' floating point immediate to an eligible long
19015 // integer op (addl, smull, ...).
19016 return SDValue();
19017 }
19018
19019 if (!VT.is64BitVector())
19020 return SDValue();
19021
19022 SDLoc DL(N);
19023 unsigned NumElems = VT.getVectorNumElements();
19024 if (N.getValueType().is64BitVector()) {
19025 MVT ElementTy = VT.getVectorElementType();
19026 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
19027 N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
19028 }
19029
19030 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
19031 DAG.getConstant(NumElems, DL, MVT::i64));
19032}
19033
19034static bool isEssentiallyExtractHighSubvector(SDValue N) {
19035 if (N.getOpcode() == ISD::BITCAST)
19036 N = N.getOperand(0);
19037 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
19038 return false;
19039 if (N.getOperand(0).getValueType().isScalableVector())
19040 return false;
19041 return N.getConstantOperandAPInt(1) ==
19042 N.getOperand(0).getValueType().getVectorNumElements() / 2;
19043}
19044
19045/// Helper structure to keep track of ISD::SET_CC operands.
19046struct GenericSetCCInfo {
19047 const SDValue *Opnd0;
19048 const SDValue *Opnd1;
19049 ISD::CondCode CC;
19050};
19051
19052/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
19053struct AArch64SetCCInfo {
19054 const SDValue *Cmp;
19055 AArch64CC::CondCode CC;
19056};
19057
19058/// Helper structure to keep track of SetCC information.
19059union SetCCInfo {
19060 GenericSetCCInfo Generic;
19061 AArch64SetCCInfo AArch64;
19062};
19063
19064/// Helper structure to be able to read SetCC information. If the IsAArch64
19065/// field is set to true, Info is an AArch64SetCCInfo, otherwise Info is a
19066/// GenericSetCCInfo.
19067struct SetCCInfoAndKind {
19068 SetCCInfo Info;
19069 bool IsAArch64;
19070};
19071
19072/// Check whether or not \p Op is a SET_CC operation, either a generic or
19073/// an
19074/// AArch64 lowered one.
19075/// \p SetCCInfo is filled accordingly.
19076/// \post SetCCInfo is meaningful only when this function returns true.
19077/// \return True when Op is a kind of SET_CC operation.
19078static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
19079 // If this is a setcc, this is straightforward.
19080 if (Op.getOpcode() == ISD::SETCC) {
19081 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
19082 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
19083 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
19084 SetCCInfo.IsAArch64 = false;
19085 return true;
19086 }
19087 // Otherwise, check if this is a matching csel instruction.
19088 // In other words:
19089 // - csel 1, 0, cc
19090 // - csel 0, 1, !cc
19091 if (Op.getOpcode() != AArch64ISD::CSEL)
19092 return false;
19093 // Set the information about the operands.
19094 // TODO: we want the operands of the Cmp not the csel
19095 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
19096 SetCCInfo.IsAArch64 = true;
19097 SetCCInfo.Info.AArch64.CC =
19098 static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
19099
19100 // Check that the operands match the constraints:
19101 // (1) Both operands must be constants.
19102 // (2) One must be 1 and the other must be 0.
19103 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
19104 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
19105
19106 // Check (1).
19107 if (!TValue || !FValue)
19108 return false;
19109
19110 // Check (2).
19111 if (!TValue->isOne()) {
19112 // Update the comparison when we are interested in !cc.
19113 std::swap(TValue, FValue);
19114 SetCCInfo.Info.AArch64.CC =
19115 AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
19116 }
19117 return TValue->isOne() && FValue->isZero();
19118}
19119
19120// Returns true if Op is setcc or zext of setcc.
19121static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
19122 if (isSetCC(Op, Info))
19123 return true;
19124 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
19125 isSetCC(Op->getOperand(0), Info));
19126}
19127
19128// The folding we want to perform is:
19129// (add x, [zext] (setcc cc ...) )
19130// -->
19131// (csel x, (add x, 1), !cc ...)
19132//
19133// The latter will get matched to a CSINC instruction.
19134static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
19135 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
19136 SDValue LHS = Op->getOperand(0);
19137 SDValue RHS = Op->getOperand(1);
19138 SetCCInfoAndKind InfoAndKind;
19139
19140 // If both operands are a SET_CC, then we don't want to perform this
19141 // folding and create another csel as this results in more instructions
19142 // (and higher register usage).
19143 if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
19144 isSetCCOrZExtSetCC(RHS, InfoAndKind))
19145 return SDValue();
19146
19147 // If neither operand is a SET_CC, give up.
19148 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
19149 std::swap(LHS, RHS);
19150 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
19151 return SDValue();
19152 }
19153
19154 // FIXME: This could be generalized to work for FP comparisons.
19155 EVT CmpVT = InfoAndKind.IsAArch64
19156 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
19157 : InfoAndKind.Info.Generic.Opnd0->getValueType();
19158 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
19159 return SDValue();
19160
19161 SDValue CCVal;
19162 SDValue Cmp;
19163 SDLoc dl(Op);
19164 if (InfoAndKind.IsAArch64) {
19165 CCVal = DAG.getConstant(
19166 AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
19167 MVT::i32);
19168 Cmp = *InfoAndKind.Info.AArch64.Cmp;
19169 } else
19170 Cmp = getAArch64Cmp(
19171 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
19172 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
19173 dl);
19174
19175 EVT VT = Op->getValueType(0);
19176 LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
19177 return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
19178}
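// Source-level illustration (hypothetical function; exact registers may vary):
//
//   int count_if_less(int x, int a, int b) { return x + (a < b); }
//
// is expected to become
//
//   cmp  w1, w2
//   cinc w0, w0, lt
//
// i.e. the setcc feeds a CSINC instead of being materialised and then added.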
19179
19180// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
19181static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
19182 EVT VT = N->getValueType(0);
19183 // Only scalar integer and vector types.
19184 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
19185 return SDValue();
19186
19187 SDValue LHS = N->getOperand(0);
19188 SDValue RHS = N->getOperand(1);
19189 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19190 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
19191 return SDValue();
19192
19193 auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
19194 auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
19195 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
19196 return SDValue();
19197
19198 SDValue Op1 = LHS->getOperand(0);
19199 SDValue Op2 = RHS->getOperand(0);
19200 EVT OpVT1 = Op1.getValueType();
19201 EVT OpVT2 = Op2.getValueType();
19202 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
19203 Op2.getOpcode() != AArch64ISD::UADDV ||
19204 OpVT1.getVectorElementType() != VT)
19205 return SDValue();
19206
19207 SDValue Val1 = Op1.getOperand(0);
19208 SDValue Val2 = Op2.getOperand(0);
19209 EVT ValVT = Val1->getValueType(0);
19210 SDLoc DL(N);
19211 SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
19212 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
19213 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
19214 DAG.getConstant(0, DL, MVT::i64));
19215}
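// For example, a hypothetical reduction (assuming <arm_neon.h>):
//
//   uint32_t total(uint32x4_t a, uint32x4_t b) {
//     return vaddvq_u32(a) + vaddvq_u32(b);
//   }
//
// can be lowered with a single horizontal reduction, roughly:
//
//   add  v0.4s, v0.4s, v1.4s
//   addv s0, v0.4s
//   fmov w0, s0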
19216
19217/// Perform the scalar expression combine in the form of:
19218/// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
19219/// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
19220static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
19221 EVT VT = N->getValueType(0);
19222 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
19223 return SDValue();
19224
19225 SDValue LHS = N->getOperand(0);
19226 SDValue RHS = N->getOperand(1);
19227
19228 // Handle commutativity.
19229 if (LHS.getOpcode() != AArch64ISD::CSEL &&
19230 LHS.getOpcode() != AArch64ISD::CSNEG) {
19231 std::swap(LHS, RHS);
19232 if (LHS.getOpcode() != AArch64ISD::CSEL &&
19233 LHS.getOpcode() != AArch64ISD::CSNEG) {
19234 return SDValue();
19235 }
19236 }
19237
19238 if (!LHS.hasOneUse())
19239 return SDValue();
19240
19241 AArch64CC::CondCode AArch64CC =
19242 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
19243
19244 // The CSEL should include a constant one operand, and the CSNEG should
19245 // include a one or negative-one operand.
19246 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
19247 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
19248 if (!CTVal || !CFVal)
19249 return SDValue();
19250
19251 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
19252 (CTVal->isOne() || CFVal->isOne())) &&
19253 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
19254 (CTVal->isOne() || CFVal->isAllOnes())))
19255 return SDValue();
19256
19257 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
19258 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
19259 !CFVal->isOne()) {
19260 std::swap(CTVal, CFVal);
19261 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
19262 }
19263
19264 SDLoc DL(N);
19265 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
19266 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
19267 !CFVal->isAllOnes()) {
19268 APInt C = -1 * CFVal->getAPIntValue();
19269 CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
19270 CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
19271 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
19272 }
19273
19274 // It might be neutral for larger constants, as the immediate needs to be
19275 // materialized in a register.
19276 APInt ADDC = CTVal->getAPIntValue();
19277 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19278 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
19279 return SDValue();
19280
19281 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
19282 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
19283 "Unexpected constant value");
19284
19285 SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
19286 SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32);
19287 SDValue Cmp = LHS.getOperand(3);
19288
19289 return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
19290}
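// Illustrative source (hypothetical function; register names are indicative):
//
//   long select_add(long b, long p) { return b + (p ? 7 : 1); }
//
// can be emitted as
//
//   cmp   x1, #0
//   add   x8, x0, #7
//   csinc x0, x8, x0, ne
//
// i.e. CSEL(7, 1, ne) + b becomes CSINC(b + 7, b, ne).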
19291
19292// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
19293static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
19294 EVT VT = N->getValueType(0);
19295 if (N->getOpcode() != ISD::ADD)
19296 return SDValue();
19297
19298 SDValue Dot = N->getOperand(0);
19299 SDValue A = N->getOperand(1);
19300 // Handle commutativity.
19301 auto isZeroDot = [](SDValue Dot) {
19302 return (Dot.getOpcode() == AArch64ISD::UDOT ||
19303 Dot.getOpcode() == AArch64ISD::SDOT) &&
19304 isZerosVector(Dot.getOperand(0).getNode());
19305 };
19306 if (!isZeroDot(Dot))
19307 std::swap(Dot, A);
19308 if (!isZeroDot(Dot))
19309 return SDValue();
19310
19311 return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
19312 Dot.getOperand(2));
19313}
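// Example of the pattern at source level (hypothetical function; assumes
// <arm_neon.h> and the dot-product extension):
//
//   uint32x4_t dot_acc(uint32x4_t acc, uint8x16_t x, uint8x16_t y) {
//     return vaddq_u32(acc, vdotq_u32(vdupq_n_u32(0), x, y));
//   }
//
// The add of the zero-accumulated UDOT folds into a single UDOT that
// accumulates directly into acc, e.g. "udot v0.4s, v1.16b, v2.16b".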
19314
19315static bool isNegatedInteger(SDValue Op) {
19316 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
19317}
19318
19319static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
19320 SDLoc DL(Op);
19321 EVT VT = Op.getValueType();
19322 SDValue Zero = DAG.getConstant(0, DL, VT);
19323 return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
19324}
19325
19326// Try to fold
19327//
19328// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
19329//
19330// The folding helps csel to be matched with csneg without generating
19331// redundant neg instruction, which includes negation of the csel expansion
19332// of abs node lowered by lowerABS.
19333static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
19334 if (!isNegatedInteger(SDValue(N, 0)))
19335 return SDValue();
19336
19337 SDValue CSel = N->getOperand(1);
19338 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
19339 return SDValue();
19340
19341 SDValue N0 = CSel.getOperand(0);
19342 SDValue N1 = CSel.getOperand(1);
19343
19344 // If neither of them is a negation, it's not worth the folding as it
19345 // introduces two additional negations while reducing one negation.
19346 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
19347 return SDValue();
19348
19349 SDValue N0N = getNegatedInteger(N0, DAG);
19350 SDValue N1N = getNegatedInteger(N1, DAG);
19351
19352 SDLoc DL(N);
19353 EVT VT = CSel.getValueType();
19354 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
19355 CSel.getOperand(3));
19356}
19357
19358// The basic add/sub long vector instructions have variants with "2" on the end
19359// which act on the high-half of their inputs. They are normally matched by
19360// patterns like:
19361//
19362// (add (zeroext (extract_high LHS)),
19363// (zeroext (extract_high RHS)))
19364// -> uaddl2 vD, vN, vM
19365//
19366// However, if one of the extracts is something like a duplicate, this
19367// instruction can still be used profitably. This function puts the DAG into a
19368// more appropriate form for those patterns to trigger.
19369static SDValue performAddSubLongCombine(SDNode *N,
19370 TargetLowering::DAGCombinerInfo &DCI) {
19371 SelectionDAG &DAG = DCI.DAG;
19372 if (DCI.isBeforeLegalizeOps())
19373 return SDValue();
19374
19375 MVT VT = N->getSimpleValueType(0);
19376 if (!VT.is128BitVector()) {
19377 if (N->getOpcode() == ISD::ADD)
19378 return performSetccAddFolding(N, DAG);
19379 return SDValue();
19380 }
19381
19382 // Make sure both branches are extended in the same way.
19383 SDValue LHS = N->getOperand(0);
19384 SDValue RHS = N->getOperand(1);
19385 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
19386 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
19387 LHS.getOpcode() != RHS.getOpcode())
19388 return SDValue();
19389
19390 unsigned ExtType = LHS.getOpcode();
19391
19392 // It's not worth doing if at least one of the inputs isn't already an
19393 // extract, but we don't know which it'll be so we have to try both.
19394 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
19395 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
19396 if (!RHS.getNode())
19397 return SDValue();
19398
19399 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
19400 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
19401 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
19402 if (!LHS.getNode())
19403 return SDValue();
19404
19405 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
19406 }
19407
19408 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
19409}
19410
19411static bool isCMP(SDValue Op) {
19412 return Op.getOpcode() == AArch64ISD::SUBS &&
19413 !Op.getNode()->hasAnyUseOfValue(0);
19414}
19415
19416// (CSEL 1 0 CC Cond) => CC
19417// (CSEL 0 1 CC Cond) => !CC
19418static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
19419 if (Op.getOpcode() != AArch64ISD::CSEL)
19420 return std::nullopt;
19421 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
19422 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
19423 return std::nullopt;
19424 SDValue OpLHS = Op.getOperand(0);
19425 SDValue OpRHS = Op.getOperand(1);
19426 if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
19427 return CC;
19428 if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
19429 return getInvertedCondCode(CC);
19430
19431 return std::nullopt;
19432}
19433
19434// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
19435// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
19436static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
19437 SDValue CmpOp = Op->getOperand(2);
19438 if (!isCMP(CmpOp))
19439 return SDValue();
19440
19441 if (IsAdd) {
19442 if (!isOneConstant(CmpOp.getOperand(1)))
19443 return SDValue();
19444 } else {
19445 if (!isNullConstant(CmpOp.getOperand(0)))
19446 return SDValue();
19447 }
19448
19449 SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
19450 auto CC = getCSETCondCode(CsetOp);
19451 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
19452 return SDValue();
19453
19454 return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
19455 Op->getOperand(0), Op->getOperand(1),
19456 CsetOp.getOperand(3));
19457}
19458
19459// (ADC x 0 cond) => (CINC x HS cond)
19460static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
19461 SDValue LHS = N->getOperand(0);
19462 SDValue RHS = N->getOperand(1);
19463 SDValue Cond = N->getOperand(2);
19464
19465 if (!isNullConstant(RHS))
19466 return SDValue();
19467
19468 EVT VT = N->getValueType(0);
19469 SDLoc DL(N);
19470
19471 // (CINC x cc cond) <=> (CSINC x x !cc cond)
19472 SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32);
19473 return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
19474}
19475
19476// Transform vector add(zext i8 to i32, zext i8 to i32)
19477// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
19478// This allows extra uses of saddl/uaddl at the lower vector widths, and fewer
19479// extends.
19481 EVT VT = N->getValueType(0);
19482 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
19483 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
19484 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
19485 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
19486 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
19487 N->getOperand(0).getOperand(0).getValueType() !=
19488 N->getOperand(1).getOperand(0).getValueType())
19489 return SDValue();
19490
19491 SDValue N0 = N->getOperand(0).getOperand(0);
19492 SDValue N1 = N->getOperand(1).getOperand(0);
19493 EVT InVT = N0.getValueType();
19494
19495 EVT S1 = InVT.getScalarType();
19496 EVT S2 = VT.getScalarType();
19497 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
19498 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
19499 SDLoc DL(N);
19500 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
19503 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
19504 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
19505 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
19506 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewOp);
19507 }
19508 return SDValue();
19509}
19510
19511static SDValue performBuildVectorCombine(SDNode *N,
19512 TargetLowering::DAGCombinerInfo &DCI,
19513 SelectionDAG &DAG) {
19514 SDLoc DL(N);
19515 EVT VT = N->getValueType(0);
19516
19517 if (VT == MVT::v4f16 || VT == MVT::v4bf16) {
19518 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1),
19519 Elt2 = N->getOperand(2), Elt3 = N->getOperand(3);
19520 if (Elt0->getOpcode() == ISD::FP_ROUND &&
19521 Elt1->getOpcode() == ISD::FP_ROUND &&
19522 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
19523 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
19524 Elt0->getConstantOperandVal(1) == Elt1->getConstantOperandVal(1) &&
19525 Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19526 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19527 // Constant index.
19528 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
19529 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
19530 Elt0->getOperand(0)->getOperand(0) ==
19531 Elt1->getOperand(0)->getOperand(0) &&
19532 Elt0->getOperand(0)->getConstantOperandVal(1) == 0 &&
19533 Elt1->getOperand(0)->getConstantOperandVal(1) == 1) {
19534 SDValue LowLanesSrcVec = Elt0->getOperand(0)->getOperand(0);
19535 if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
19536 SDValue HighLanes;
19537 if (Elt2->getOpcode() == ISD::UNDEF &&
19538 Elt3->getOpcode() == ISD::UNDEF) {
19539 HighLanes = DAG.getUNDEF(MVT::v2f32);
19540 } else if (Elt2->getOpcode() == ISD::FP_ROUND &&
19541 Elt3->getOpcode() == ISD::FP_ROUND &&
19542 isa<ConstantSDNode>(Elt2->getOperand(1)) &&
19543 isa<ConstantSDNode>(Elt3->getOperand(1)) &&
19544 Elt2->getConstantOperandVal(1) ==
19545 Elt3->getConstantOperandVal(1) &&
19546 Elt2->getOperand(0)->getOpcode() ==
19547 ISD::EXTRACT_VECTOR_ELT &&
19548 Elt3->getOperand(0)->getOpcode() ==
19549 ISD::EXTRACT_VECTOR_ELT &&
19550 // Constant index.
19551 isa<ConstantSDNode>(Elt2->getOperand(0)->getOperand(1)) &&
19552 isa<ConstantSDNode>(Elt3->getOperand(0)->getOperand(1)) &&
19553 Elt2->getOperand(0)->getOperand(0) ==
19554 Elt3->getOperand(0)->getOperand(0) &&
19555 Elt2->getOperand(0)->getConstantOperandVal(1) == 0 &&
19556 Elt3->getOperand(0)->getConstantOperandVal(1) == 1) {
19557 SDValue HighLanesSrcVec = Elt2->getOperand(0)->getOperand(0);
19558 HighLanes =
19559 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, HighLanesSrcVec);
19560 }
19561 if (HighLanes) {
19562 SDValue DoubleToSingleSticky =
19563 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, LowLanesSrcVec);
19564 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
19565 DoubleToSingleSticky, HighLanes);
19566 return DAG.getNode(ISD::FP_ROUND, DL, VT, Concat,
19567 Elt0->getOperand(1));
19568 }
19569 }
19570 }
19571 }
19572
19573 if (VT == MVT::v2f64) {
19574 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
19575 if (Elt0->getOpcode() == ISD::FP_EXTEND &&
19576 Elt1->getOpcode() == ISD::FP_EXTEND &&
19577 Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19578 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19579 Elt0->getOperand(0)->getOperand(0) ==
19580 Elt1->getOperand(0)->getOperand(0) &&
19581 // Constant index.
19582 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
19583 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
19584 Elt0->getOperand(0)->getConstantOperandVal(1) + 1 ==
19585 Elt1->getOperand(0)->getConstantOperandVal(1) &&
19586 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
19587 // ResultType's known minimum vector length.
19588 Elt0->getOperand(0)->getConstantOperandVal(1) %
19589 VT.getVectorMinNumElements() ==
19590 0) {
19591 SDValue SrcVec = Elt0->getOperand(0)->getOperand(0);
19592 if (SrcVec.getValueType() == MVT::v4f16 ||
19593 SrcVec.getValueType() == MVT::v4bf16) {
19594 SDValue HalfToSingle =
19595 DAG.getNode(ISD::FP_EXTEND, DL, MVT::v4f32, SrcVec);
19596 SDValue SubvectorIdx = Elt0->getOperand(0)->getOperand(1);
19597 SDValue Extract = DAG.getNode(
19599 HalfToSingle, SubvectorIdx);
19600 return DAG.getNode(ISD::FP_EXTEND, DL, VT, Extract);
19601 }
19602 }
19603 }
19604
19605 // A build vector of two extracted elements is equivalent to an
19606 // extract subvector where the inner vector is any-extended to the
19607 // extract_vector_elt VT.
19608 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
19609 // (extract_elt_iXX_to_i32 vec Idx+1))
19610 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
19611
19612 // For now, only consider the v2i32 case, which arises as a result of
19613 // legalization.
19614 if (VT != MVT::v2i32)
19615 return SDValue();
19616
19617 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
19618 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
19619 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19620 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19621 // Constant index.
19622 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
19623 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
19624 // Both EXTRACT_VECTOR_ELT from same vector...
19625 Elt0->getOperand(0) == Elt1->getOperand(0) &&
19626 // ... and contiguous. First element's index +1 == second element's index.
19627 Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
19628 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
19629 // ResultType's known minimum vector length.
19630 Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
19631 SDValue VecToExtend = Elt0->getOperand(0);
19632 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
19633 if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
19634 return SDValue();
19635
19636 SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
19637
19638 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
19639 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
19640 SubvectorIdx);
19641 }
19642
19643 return SDValue();
19644}
19645
19646static SDValue performTruncateCombine(SDNode *N,
19647 SelectionDAG &DAG) {
19648 EVT VT = N->getValueType(0);
19649 SDValue N0 = N->getOperand(0);
19650 if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
19651 N0.getOpcode() == AArch64ISD::DUP) {
19652 SDValue Op = N0.getOperand(0);
19653 if (VT.getScalarType() == MVT::i32 &&
19654 N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
19655 Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i32, Op);
19656 return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Op);
19657 }
19658
19659 return SDValue();
19660}
19661
19662// Check whether a node is an extend or shift operand.
19663static bool isExtendOrShiftOperand(SDValue N) {
19664 unsigned Opcode = N.getOpcode();
19665 if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
19666 EVT SrcVT;
19667 if (Opcode == ISD::SIGN_EXTEND_INREG)
19668 SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
19669 else
19670 SrcVT = N.getOperand(0).getValueType();
19671
19672 return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
19673 } else if (Opcode == ISD::AND) {
19674 ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
19675 if (!CSD)
19676 return false;
19677 uint64_t AndMask = CSD->getZExtValue();
19678 return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
19679 } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
19680 return isa<ConstantSDNode>(N.getOperand(1));
19681 }
19682
19683 return false;
19684}
19685
19686// (N - Y) + Z --> (Z - Y) + N
19687// when N is an extend or shift operand
19688static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z,
19689 SelectionDAG &DAG) {
19690 auto IsOneUseExtend = [](SDValue N) {
19691 return N.hasOneUse() && isExtendOrShiftOperand(N);
19692 };
19693
19694 // DAGCombiner will revert the combination when Z is a constant, causing an
19695 // infinite loop, so don't enable the combination when Z is a constant.
19696 // If Z is a one-use shift, we also can't do the optimization; it would fall
19697 // into the same infinite loop.
19698 if (isa<ConstantSDNode>(Z) || IsOneUseExtend(Z))
19699 return SDValue();
19700
19701 if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
19702 return SDValue();
19703
19704 SDValue Shift = SUB.getOperand(0);
19705 if (!IsOneUseExtend(Shift))
19706 return SDValue();
19707
19708 SDLoc DL(N);
19709 EVT VT = N->getValueType(0);
19710
19711 SDValue Y = SUB.getOperand(1);
19712 SDValue NewSub = DAG.getNode(ISD::SUB, DL, VT, Z, Y);
19713 return DAG.getNode(ISD::ADD, DL, VT, NewSub, Shift);
19714}
19715
19716static SDValue performAddCombineForShiftedOperands(SDNode *N,
19717 SelectionDAG &DAG) {
19718 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
19719 // commutative.
19720 if (N->getOpcode() != ISD::ADD)
19721 return SDValue();
19722
19723 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
19724 // shifted register is only available for i32 and i64.
19725 EVT VT = N->getValueType(0);
19726 if (VT != MVT::i32 && VT != MVT::i64)
19727 return SDValue();
19728
19729 SDLoc DL(N);
19730 SDValue LHS = N->getOperand(0);
19731 SDValue RHS = N->getOperand(1);
19732
19733 if (SDValue Val = performAddCombineSubShift(N, LHS, RHS, DAG))
19734 return Val;
19735 if (SDValue Val = performAddCombineSubShift(N, RHS, LHS, DAG))
19736 return Val;
19737
19738 uint64_t LHSImm = 0, RHSImm = 0;
19739 // If both operands are shifted by an immediate and the shift amount is not
19740 // greater than 4 for one operand, swap LHS and RHS to put the operand with
19741 // the smaller shift amount on RHS.
19742 //
19743 // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
19744 // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
19745 // with LSL (shift > 4). For the remaining processors, this is a no-op for
19746 // both performance and correctness.
19747 if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
19748 isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
19749 RHSImm > 4 && LHS.hasOneUse())
19750 return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
19751
19752 return SDValue();
19753}
19754
19755// The mid end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2))
19756// This reassociates it back to allow the creation of more mls instructions.
19757static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG) {
19758 if (N->getOpcode() != ISD::SUB)
19759 return SDValue();
19760
19761 SDValue Add = N->getOperand(1);
19762 SDValue X = N->getOperand(0);
19763 if (Add.getOpcode() != ISD::ADD)
19764 return SDValue();
19765
19766 if (!Add.hasOneUse())
19767 return SDValue();
19768 if (DAG.isConstantIntBuildVectorOrConstantInt(X))
19769 return SDValue();
19770
19771 SDValue M1 = Add.getOperand(0);
19772 SDValue M2 = Add.getOperand(1);
19773 if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
19774 M1.getOpcode() != AArch64ISD::UMULL)
19775 return SDValue();
19776 if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
19777 M2.getOpcode() != AArch64ISD::UMULL)
19778 return SDValue();
19779
19780 EVT VT = N->getValueType(0);
19781 SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, X, M1);
19782 return DAG.getNode(ISD::SUB, SDLoc(N), VT, Sub, M2);
19783}
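// A hypothetical example of the reassociation (assuming <arm_neon.h>):
//
//   int32x4_t two_mls(int32x4_t x, int32x4_t a, int32x4_t b,
//                     int32x4_t c, int32x4_t d) {
//     return vsubq_s32(x, vaddq_s32(vmulq_s32(a, b), vmulq_s32(c, d)));
//   }
//
// Rewriting sub(x, add(a*b, c*d)) as sub(sub(x, a*b), c*d) lets both
// multiplies be selected as MLS instructions.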
19784
19785// Combine into mla/mls.
19786// This works on the patterns of:
19787// add v1, (mul v2, v3)
19788// sub v1, (mul v2, v3)
19789// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
19790// It will transform the add/sub to a scalable version, so that we can
19791// make use of SVE's MLA/MLS that will be generated for that pattern
19792static SDValue
19794 SelectionDAG &DAG = DCI.DAG;
19795 // Make sure that the types are legal
19796 if (!DCI.isAfterLegalizeDAG())
19797 return SDValue();
19798 // Before using SVE's features, check first if it's available.
19799 if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
19800 return SDValue();
19801
19802 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
19803 return SDValue();
19804
19805 if (!N->getValueType(0).isFixedLengthVector())
19806 return SDValue();
19807
19808 auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
19809 if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
19810 return SDValue();
19811
19812 if (!cast<ConstantSDNode>(Op1->getOperand(1))->isZero())
19813 return SDValue();
19814
19815 SDValue MulValue = Op1->getOperand(0);
19816 if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
19817 return SDValue();
19818
19819 if (!Op1.hasOneUse() || !MulValue.hasOneUse())
19820 return SDValue();
19821
19822 EVT ScalableVT = MulValue.getValueType();
19823 if (!ScalableVT.isScalableVector())
19824 return SDValue();
19825
19826 SDValue ScaledOp = convertToScalableVector(DAG, ScalableVT, Op0);
19827 SDValue NewValue =
19828 DAG.getNode(N->getOpcode(), SDLoc(N), ScalableVT, {ScaledOp, MulValue});
19829 return convertFromScalableVector(DAG, N->getValueType(0), NewValue);
19830 };
19831
19832 if (SDValue res = performOpt(N->getOperand(0), N->getOperand(1)))
19833 return res;
19834 else if (N->getOpcode() == ISD::ADD)
19835 return performOpt(N->getOperand(1), N->getOperand(0));
19836
19837 return SDValue();
19838}
19839
19840// Given a i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
19841// help, for example, to produce ssra from sshr+add.
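// Illustrative example (not from the source):
//   %s = extract_vector_elt (sshr <1 x i64> %v, #3), 0
//   %r = add i64 %s, (load i64, ptr %p)
// is rebuilt as a v1i64 add so that the shift and add can be matched as a
// single ssra (shift-right-and-accumulate) on the D register.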
19842static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) {
19843 EVT VT = N->getValueType(0);
19844 if (VT != MVT::i64)
19845 return SDValue();
19846 SDValue Op0 = N->getOperand(0);
19847 SDValue Op1 = N->getOperand(1);
19848
19849 // At least one of the operands should be an extract, and the other should be
19850 // something that is easy to convert to v1i64 type (in this case a load).
19851 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
19852 Op0.getOpcode() != ISD::LOAD)
19853 return SDValue();
19854 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
19855 Op1.getOpcode() != ISD::LOAD)
19856 return SDValue();
19857
19858 SDLoc DL(N);
19859 if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19860 Op0.getOperand(0).getValueType() == MVT::v1i64) {
19861 Op0 = Op0.getOperand(0);
19862 Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
19863 } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19864 Op1.getOperand(0).getValueType() == MVT::v1i64) {
19865 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
19866 Op1 = Op1.getOperand(0);
19867 } else
19868 return SDValue();
19869
19870 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
19871 DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
19872 DAG.getConstant(0, DL, MVT::i64));
19873}
19874
19875static bool isLoadOrMultipleLoads(SDValue B, SmallVector<LoadSDNode *> &Loads) {
19876 SDValue BV = peekThroughOneUseBitcasts(B);
19877 if (!BV->hasOneUse())
19878 return false;
19879 if (auto *Ld = dyn_cast<LoadSDNode>(BV)) {
19880 if (!Ld || !Ld->isSimple())
19881 return false;
19882 Loads.push_back(Ld);
19883 return true;
19884 } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
19885 BV.getOpcode() == ISD::CONCAT_VECTORS) {
19886 for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
19887 auto *Ld = dyn_cast<LoadSDNode>(BV.getOperand(Op));
19888 if (!Ld || !Ld->isSimple() || !BV.getOperand(Op).hasOneUse())
19889 return false;
19890 Loads.push_back(Ld);
19891 }
19892 return true;
19893 } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
19894 // Try to find a tree of shuffles and concats from how IR shuffles of loads
19895 // are lowered. Note that this only comes up because we do not always visit
19896 // operands before uses. After that is fixed this can be removed and in the
19897 // meantime this is fairly specific to the lowering we expect from IR.
19898 // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
19899 // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
19900 // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
19901 // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
19902 // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
19903 // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
19904 // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
19905 // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
19906 // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
19907 if (B.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE ||
19908 B.getOperand(0).getOperand(0).getOpcode() != ISD::CONCAT_VECTORS ||
19909 B.getOperand(0).getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
19910 B.getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
19911 B.getOperand(1).getNumOperands() != 4)
19912 return false;
19913 auto SV1 = cast<ShuffleVectorSDNode>(B);
19914 auto SV2 = cast<ShuffleVectorSDNode>(B.getOperand(0));
19915 int NumElts = B.getValueType().getVectorNumElements();
19916 int NumSubElts = NumElts / 4;
19917 for (int I = 0; I < NumSubElts; I++) {
19918 // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
19919 if (SV1->getMaskElt(I) != I ||
19920 SV1->getMaskElt(I + NumSubElts) != I + NumSubElts ||
19921 SV1->getMaskElt(I + NumSubElts * 2) != I + NumSubElts * 2 ||
19922 SV1->getMaskElt(I + NumSubElts * 3) != I + NumElts)
19923 return false;
19924 // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
19925 if (SV2->getMaskElt(I) != I ||
19926 SV2->getMaskElt(I + NumSubElts) != I + NumSubElts ||
19927 SV2->getMaskElt(I + NumSubElts * 2) != I + NumElts)
19928 return false;
19929 }
19930 auto *Ld0 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(0));
19931 auto *Ld1 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(1));
19932 auto *Ld2 = dyn_cast<LoadSDNode>(SV2->getOperand(1).getOperand(0));
19933 auto *Ld3 = dyn_cast<LoadSDNode>(B.getOperand(1).getOperand(0));
19934 if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
19935 !Ld2->isSimple() || !Ld3->isSimple())
19936 return false;
19937 Loads.push_back(Ld0);
19938 Loads.push_back(Ld1);
19939 Loads.push_back(Ld2);
19940 Loads.push_back(Ld3);
19941 return true;
19942 }
19943 return false;
19944}
19945
19946static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1,
19947 SelectionDAG &DAG,
19948 unsigned &NumSubLoads) {
19949 if (!Op0.hasOneUse() || !Op1.hasOneUse())
19950 return false;
19951
19952 SmallVector<LoadSDNode *> Loads0, Loads1;
19953 if (isLoadOrMultipleLoads(Op0, Loads0) &&
19954 isLoadOrMultipleLoads(Op1, Loads1)) {
19955 if (NumSubLoads && Loads0.size() != NumSubLoads)
19956 return false;
19957 NumSubLoads = Loads0.size();
19958 return Loads0.size() == Loads1.size() &&
19959 all_of(zip(Loads0, Loads1), [&DAG](auto L) {
19960 unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
19961 return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
19962 DAG.areNonVolatileConsecutiveLoads(get<1>(L), get<0>(L),
19963 Size / 8, 1);
19964 });
19965 }
19966
19967 if (Op0.getOpcode() != Op1.getOpcode())
19968 return false;
19969
19970 switch (Op0.getOpcode()) {
19971 case ISD::ADD:
19972 case ISD::SUB:
19973 return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
19974 DAG, NumSubLoads) &&
19975 areLoadedOffsetButOtherwiseSame(Op0.getOperand(1), Op1.getOperand(1),
19976 DAG, NumSubLoads);
19977 case ISD::SIGN_EXTEND:
19978 case ISD::ANY_EXTEND:
19979 case ISD::ZERO_EXTEND:
19980 EVT XVT = Op0.getOperand(0).getValueType();
19981 if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
19982 XVT.getScalarSizeInBits() != 32)
19983 return false;
19984 return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
19985 DAG, NumSubLoads);
19986 }
19987 return false;
19988}
19989
19990// This method attempts to fold trees of add(ext(load p), shl(ext(load p+4)))
19991// into a single load of twice the size, from which we extract the bottom and
19992// top parts so that the shl can use a shll2 instruction. The two loads in that
19993// example can also be larger trees of instructions, which are identical except
19994// for the leaves which are all loads offset from the LHS, including
19995// buildvectors of multiple loads. For example the RHS tree could be
19996// sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4)))
19997// Whilst it can be common for the larger loads to replace LDP instructions
19998// (which doesn't gain anything on its own), the larger loads can help create
19999// more efficient code, and in buildvectors prevent the need for ld1 lane
20000// inserts which can be slower than normal loads.
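// A simplified example of the intent (illustrative only):
//   add (zext (load p)), (shl (zext (load p+4)), C)
// becomes a single load of twice the width; the low half feeds a ushll and
// the high half a ushll2, so the shifted operand uses the high-half form and
// the second load disappears.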
20001static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) {
20002 EVT VT = N->getValueType(0);
20003 if (!VT.isFixedLengthVector() ||
20004 (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
20005 VT.getScalarSizeInBits() != 64))
20006 return SDValue();
20007
20008 SDValue Other = N->getOperand(0);
20009 SDValue Shift = N->getOperand(1);
20010 if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
20011 std::swap(Shift, Other);
20012 APInt ShiftAmt;
20013 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
20014 !ISD::isConstantSplatVector(Shift.getOperand(1).getNode(), ShiftAmt))
20015 return SDValue();
20016
20017 if (!ISD::isExtOpcode(Shift.getOperand(0).getOpcode()) ||
20018 !ISD::isExtOpcode(Other.getOpcode()) ||
20019 Shift.getOperand(0).getOperand(0).getValueType() !=
20020 Other.getOperand(0).getValueType() ||
20021 !Other.hasOneUse() || !Shift.getOperand(0).hasOneUse())
20022 return SDValue();
20023
20024 SDValue Op0 = Other.getOperand(0);
20025 SDValue Op1 = Shift.getOperand(0).getOperand(0);
20026
20027 unsigned NumSubLoads = 0;
20028 if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
20029 return SDValue();
20030
20031 // Attempt to rule out some unprofitable cases using heuristics (some working
20032 // around suboptimal code generation), notably if the extend would not be able
20033 // to use ushll2 instructions as the types are not large enough. Otherwise zips
20034 // will need to be created, which can increase the instruction count.
20035 unsigned NumElts = Op0.getValueType().getVectorNumElements();
20036 unsigned NumSubElts = NumElts / NumSubLoads;
20037 if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
20038 (Other.getOpcode() != Shift.getOperand(0).getOpcode() &&
20039 Op0.getValueType().getSizeInBits() < 128 &&
20041 return SDValue();
20042
20043 // Recreate the tree with the new combined loads.
20044 std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
20045 [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
20046 EVT DVT =
20047 Op0.getValueType().getDoubleNumVectorElementsVT(*DAG.getContext());
20048
20049 SmallVector<LoadSDNode *> Loads0, Loads1;
20050 if (isLoadOrMultipleLoads(Op0, Loads0) &&
20051 isLoadOrMultipleLoads(Op1, Loads1)) {
20052 EVT LoadVT = EVT::getVectorVT(
20053 *DAG.getContext(), Op0.getValueType().getScalarType(),
20054 Op0.getValueType().getVectorNumElements() / Loads0.size());
20055 EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(*DAG.getContext());
20056
20057 SmallVector<SDValue> NewLoads;
20058 for (const auto &[L0, L1] : zip(Loads0, Loads1)) {
20059 SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(),
20060 L0->getBasePtr(), L0->getPointerInfo(),
20061 L0->getOriginalAlign());
20062 DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1));
20063 DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1));
20064 NewLoads.push_back(Load);
20065 }
20066 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op0), DVT, NewLoads);
20067 }
20068
20069 SmallVector<SDValue> Ops;
20070 for (const auto &[O0, O1] : zip(Op0->op_values(), Op1->op_values()))
20071 Ops.push_back(GenCombinedTree(O0, O1, DAG));
20072 return DAG.getNode(Op0.getOpcode(), SDLoc(Op0), DVT, Ops);
20073 };
20074 SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
20075
20076 SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
20077 int Hi = NumSubElts, Lo = 0;
20078 for (unsigned i = 0; i < NumSubLoads; i++) {
20079 for (unsigned j = 0; j < NumSubElts; j++) {
20080 LowMask[i * NumSubElts + j] = Lo++;
20081 HighMask[i * NumSubElts + j] = Hi++;
20082 }
20083 Lo += NumSubElts;
20084 Hi += NumSubElts;
20085 }
20086 SDLoc DL(N);
20087 SDValue Ext0, Ext1;
20088 // Extract the top and bottom lanes, then extend the result. Alternatively,
20089 // extend the result and then extract the lanes if the two operands match, as
20090 // that produces slightly smaller code.
20091 if (Other.getOpcode() != Shift.getOperand(0).getOpcode()) {
20092 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(),
20093 NewOp, DAG.getConstant(0, DL, MVT::i64));
20094 SDValue SubH =
20095 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp,
20096 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
20097 SDValue Extr0 =
20098 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, LowMask);
20099 SDValue Extr1 =
20100 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, HighMask);
20101 Ext0 = DAG.getNode(Other.getOpcode(), DL, VT, Extr0);
20102 Ext1 = DAG.getNode(Shift.getOperand(0).getOpcode(), DL, VT, Extr1);
20103 } else {
20104 EVT DVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
20105 SDValue Ext = DAG.getNode(Other.getOpcode(), DL, DVT, NewOp);
20106 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
20107 DAG.getConstant(0, DL, MVT::i64));
20108 SDValue SubH =
20109 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
20110 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
20111 Ext0 = DAG.getVectorShuffle(VT, DL, SubL, SubH, LowMask);
20112 Ext1 = DAG.getVectorShuffle(VT, DL, SubL, SubH, HighMask);
20113 }
20114 SDValue NShift =
20115 DAG.getNode(Shift.getOpcode(), DL, VT, Ext1, Shift.getOperand(1));
20116 return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift);
20117}
20118
20119static SDValue performAddSubCombine(SDNode *N,
20120 TargetLowering::DAGCombinerInfo &DCI) {
20121 // Try to change sum of two reductions.
20122 if (SDValue Val = performAddUADDVCombine(N, DCI.DAG))
20123 return Val;
20124 if (SDValue Val = performAddDotCombine(N, DCI.DAG))
20125 return Val;
20126 if (SDValue Val = performAddCSelIntoCSinc(N, DCI.DAG))
20127 return Val;
20128 if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
20129 return Val;
20130 if (SDValue Val = performVectorAddSubExtCombine(N, DCI.DAG))
20131 return Val;
20132 if (SDValue Val = performAddCombineForShiftedOperands(N, DCI.DAG))
20133 return Val;
20134 if (SDValue Val = performSubAddMULCombine(N, DCI.DAG))
20135 return Val;
20136 if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
20137 return Val;
20138 if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG))
20139 return Val;
20140
20141 if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
20142 return Val;
20143
20144 return performAddSubLongCombine(N, DCI);
20145}
20146
20147// Massage DAGs which we can use the high-half "long" operations on into
20148// something isel will recognize better. E.g.
20149//
20150// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
20151// (aarch64_neon_umull (extract_high (v2i64 vec)))
20152// (extract_high (v2i64 (dup128 scalar)))))
20153//
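// For example (illustrative): if LHS is already an extract_high of a 128-bit
// vector, the DUP on RHS is widened to a 128-bit DUP and an extract_high of it
// is used instead, so isel can select the high-half form (e.g. umull2) rather
// than a plain umull plus extra lane moves.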
20154static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
20155 TargetLowering::DAGCombinerInfo &DCI,
20156 SelectionDAG &DAG) {
20157 if (DCI.isBeforeLegalizeOps())
20158 return SDValue();
20159
20160 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
20161 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
20162 assert(LHS.getValueType().is64BitVector() &&
20163 RHS.getValueType().is64BitVector() &&
20164 "unexpected shape for long operation");
20165
20166 // Either node could be a DUP, but it's not worth doing both of them (you'd
20167 // just as well use the non-high version) so look for a corresponding extract
20168 // operation on the other "wing".
20169 if (isEssentiallyExtractHighSubvector(LHS)) {
20170 RHS = tryExtendDUPToExtractHigh(RHS, DAG);
20171 if (!RHS.getNode())
20172 return SDValue();
20173 } else if (isEssentiallyExtractHighSubvector(RHS)) {
20174 LHS = tryExtendDUPToExtractHigh(LHS, DAG);
20175 if (!LHS.getNode())
20176 return SDValue();
20177 } else
20178 return SDValue();
20179
20180 if (IID == Intrinsic::not_intrinsic)
20181 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
20182
20183 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
20184 N->getOperand(0), LHS, RHS);
20185}
20186
20187static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
20188 MVT ElemTy = N->getSimpleValueType(0).getScalarType();
20189 unsigned ElemBits = ElemTy.getSizeInBits();
20190
20191 int64_t ShiftAmount;
20192 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
20193 APInt SplatValue, SplatUndef;
20194 unsigned SplatBitSize;
20195 bool HasAnyUndefs;
20196 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
20197 HasAnyUndefs, ElemBits) ||
20198 SplatBitSize != ElemBits)
20199 return SDValue();
20200
20201 ShiftAmount = SplatValue.getSExtValue();
20202 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
20203 ShiftAmount = CVN->getSExtValue();
20204 } else
20205 return SDValue();
20206
20207 // If the shift amount is zero, remove the shift intrinsic.
20208 if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
20209 return N->getOperand(1);
20210
20211 unsigned Opcode;
20212 bool IsRightShift;
20213 switch (IID) {
20214 default:
20215 llvm_unreachable("Unknown shift intrinsic");
20216 case Intrinsic::aarch64_neon_sqshl:
20217 Opcode = AArch64ISD::SQSHL_I;
20218 IsRightShift = false;
20219 break;
20220 case Intrinsic::aarch64_neon_uqshl:
20221 Opcode = AArch64ISD::UQSHL_I;
20222 IsRightShift = false;
20223 break;
20224 case Intrinsic::aarch64_neon_srshl:
20225 Opcode = AArch64ISD::SRSHR_I;
20226 IsRightShift = true;
20227 break;
20228 case Intrinsic::aarch64_neon_urshl:
20229 Opcode = AArch64ISD::URSHR_I;
20230 IsRightShift = true;
20231 break;
20232 case Intrinsic::aarch64_neon_sqshlu:
20233 Opcode = AArch64ISD::SQSHLU_I;
20234 IsRightShift = false;
20235 break;
20236 case Intrinsic::aarch64_neon_sshl:
20237 case Intrinsic::aarch64_neon_ushl:
20238 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
20239 // left shift for positive shift amounts. For negative shifts we can use a
20240 // VASHR/VLSHR as appropriate.
20241 if (ShiftAmount < 0) {
20242 Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
20243 : AArch64ISD::VLSHR;
20244 ShiftAmount = -ShiftAmount;
20245 } else
20246 Opcode = AArch64ISD::VSHL;
20247 IsRightShift = false;
20248 break;
20249 }
20250
20251 EVT VT = N->getValueType(0);
20252 SDValue Op = N->getOperand(1);
20253 SDLoc dl(N);
20254 if (VT == MVT::i64) {
20255 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op);
20256 VT = MVT::v1i64;
20257 }
20258
20259 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
20260 Op = DAG.getNode(Opcode, dl, VT, Op,
20261 DAG.getConstant(-ShiftAmount, dl, MVT::i32));
20262 if (N->getValueType(0) == MVT::i64)
20263 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
20264 DAG.getConstant(0, dl, MVT::i64));
20265 return Op;
20266 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
20267 Op = DAG.getNode(Opcode, dl, VT, Op,
20268 DAG.getConstant(ShiftAmount, dl, MVT::i32));
20269 if (N->getValueType(0) == MVT::i64)
20270 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
20271 DAG.getConstant(0, dl, MVT::i64));
20272 return Op;
20273 }
20274
20275 return SDValue();
20276}
20277
20278// The CRC32[BH] instructions ignore the high bits of their data operand. Since
20279// the intrinsics must be legal and take an i32, this means there's almost
20280// certainly going to be a zext in the DAG which we can eliminate.
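// For instance (illustrative): crc32b(%acc, and(%x, 0xff)) only ever reads the
// low byte of the data operand, so the masking AND (typically a zext from i8)
// is dropped and %x is fed to the intrinsic directly.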
20281static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
20282 SDValue AndN = N->getOperand(2);
20283 if (AndN.getOpcode() != ISD::AND)
20284 return SDValue();
20285
20286 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
20287 if (!CMask || CMask->getZExtValue() != Mask)
20288 return SDValue();
20289
20290 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
20291 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
20292}
20293
20294static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
20295 SelectionDAG &DAG) {
20296 SDLoc dl(N);
20297 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
20298 DAG.getNode(Opc, dl,
20299 N->getOperand(1).getSimpleValueType(),
20300 N->getOperand(1)),
20301 DAG.getConstant(0, dl, MVT::i64));
20302}
20303
20304static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
20305 SDLoc DL(N);
20306 SDValue Op1 = N->getOperand(1);
20307 SDValue Op2 = N->getOperand(2);
20308 EVT ScalarTy = Op2.getValueType();
20309 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
20310 ScalarTy = MVT::i32;
20311
20312 // Lower index_vector(base, step) to mul(step, step_vector(1)) + splat(base).
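 // E.g. (illustrative): index_vector(3, 2) over nxv4i32 becomes
 //   splat(2) * step_vector<0,1,2,3,...> + splat(3)  ==>  <3,5,7,9,...>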
20313 SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
20314 SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
20315 SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
20316 SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
20317 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
20318}
20319
20320static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
20321 SDLoc dl(N);
20322 SDValue Scalar = N->getOperand(3);
20323 EVT ScalarTy = Scalar.getValueType();
20324
20325 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
20326 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
20327
20328 SDValue Passthru = N->getOperand(1);
20329 SDValue Pred = N->getOperand(2);
20330 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
20331 Pred, Scalar, Passthru);
20332}
20333
20334static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
20335 SDLoc dl(N);
20336 LLVMContext &Ctx = *DAG.getContext();
20337 EVT VT = N->getValueType(0);
20338
20339 assert(VT.isScalableVector() && "Expected a scalable vector.");
20340
20341 // Current lowering only supports the SVE-ACLE types.
20342 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
20343 return SDValue();
20344
20345 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
20346 unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
20347 EVT ByteVT =
20348 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
20349
20350 // Convert everything to the domain of EXT (i.e. bytes).
20351 SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
20352 SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
20353 SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
20354 DAG.getConstant(ElemSize, dl, MVT::i32));
20355
20356 SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
20357 return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
20358}
20359
20360static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
20361 TargetLowering::DAGCombinerInfo &DCI,
20362 SelectionDAG &DAG) {
20363 if (DCI.isBeforeLegalize())
20364 return SDValue();
20365
20366 SDValue Comparator = N->getOperand(3);
20367 if (Comparator.getOpcode() == AArch64ISD::DUP ||
20368 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
20369 unsigned IID = getIntrinsicID(N);
20370 EVT VT = N->getValueType(0);
20371 EVT CmpVT = N->getOperand(2).getValueType();
20372 SDValue Pred = N->getOperand(1);
20373 SDValue Imm;
20374 SDLoc DL(N);
20375
20376 switch (IID) {
20377 default:
20378 llvm_unreachable("Called with wrong intrinsic!");
20379 break;
20380
20381 // Signed comparisons
20382 case Intrinsic::aarch64_sve_cmpeq_wide:
20383 case Intrinsic::aarch64_sve_cmpne_wide:
20384 case Intrinsic::aarch64_sve_cmpge_wide:
20385 case Intrinsic::aarch64_sve_cmpgt_wide:
20386 case Intrinsic::aarch64_sve_cmplt_wide:
20387 case Intrinsic::aarch64_sve_cmple_wide: {
20388 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
20389 int64_t ImmVal = CN->getSExtValue();
20390 if (ImmVal >= -16 && ImmVal <= 15)
20391 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
20392 else
20393 return SDValue();
20394 }
20395 break;
20396 }
20397 // Unsigned comparisons
20398 case Intrinsic::aarch64_sve_cmphs_wide:
20399 case Intrinsic::aarch64_sve_cmphi_wide:
20400 case Intrinsic::aarch64_sve_cmplo_wide:
20401 case Intrinsic::aarch64_sve_cmpls_wide: {
20402 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
20403 uint64_t ImmVal = CN->getZExtValue();
20404 if (ImmVal <= 127)
20405 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
20406 else
20407 return SDValue();
20408 }
20409 break;
20410 }
20411 }
20412
20413 if (!Imm)
20414 return SDValue();
20415
20416 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
20417 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
20418 N->getOperand(2), Splat, DAG.getCondCode(CC));
20419 }
20420
20421 return SDValue();
20422}
20423
20424static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
20425 AArch64CC::CondCode Cond) {
20426 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20427
20428 SDLoc DL(Op);
20429 assert(Op.getValueType().isScalableVector() &&
20430 TLI.isTypeLegal(Op.getValueType()) &&
20431 "Expected legal scalable vector type!");
20432 assert(Op.getValueType() == Pg.getValueType() &&
20433 "Expected same type for PTEST operands");
20434
20435 // Ensure target specific opcodes are using legal type.
20436 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
20437 SDValue TVal = DAG.getConstant(1, DL, OutVT);
20438 SDValue FVal = DAG.getConstant(0, DL, OutVT);
20439
20440 // Ensure operands have type nxv16i1.
20441 if (Op.getValueType() != MVT::nxv16i1) {
20442 if ((Cond == AArch64CC::ANY_ACTIVE || Cond == AArch64CC::NONE_ACTIVE) &&
20443 isZeroingInactiveLanes(Op))
20444 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
20445 else
20446 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
20447 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
20448 }
20449
20450 // Set condition code (CC) flags.
20451 SDValue Test = DAG.getNode(
20452 Cond == AArch64CC::ANY_ACTIVE ? AArch64ISD::PTEST_ANY : AArch64ISD::PTEST,
20453 DL, MVT::Other, Pg, Op);
20454
20455 // Convert CC to integer based on requested condition.
20456 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
20457 SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
20458 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
20459 return DAG.getZExtOrTrunc(Res, DL, VT);
20460}
20461
20462static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc,
20463 SelectionDAG &DAG) {
20464 SDLoc DL(N);
20465
20466 SDValue Pred = N->getOperand(1);
20467 SDValue VecToReduce = N->getOperand(2);
20468
20469 // NOTE: The integer reduction's result type is not always linked to the
20470 // operand's element type so we construct it from the intrinsic's result type.
20471 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
20472 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
20473
20474 // SVE reductions set the whole vector register with the first element
20475 // containing the reduction result, which we'll now extract.
20476 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20477 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
20478 Zero);
20479}
20480
20481static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
20482 SelectionDAG &DAG) {
20483 SDLoc DL(N);
20484
20485 SDValue Pred = N->getOperand(1);
20486 SDValue VecToReduce = N->getOperand(2);
20487
20488 EVT ReduceVT = VecToReduce.getValueType();
20489 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
20490
20491 // SVE reductions set the whole vector register with the first element
20492 // containing the reduction result, which we'll now extract.
20493 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20494 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
20495 Zero);
20496}
20497
20498static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
20499 SelectionDAG &DAG) {
20500 SDLoc DL(N);
20501
20502 SDValue Pred = N->getOperand(1);
20503 SDValue InitVal = N->getOperand(2);
20504 SDValue VecToReduce = N->getOperand(3);
20505 EVT ReduceVT = VecToReduce.getValueType();
20506
20507 // Ordered reductions use the first lane of the result vector as the
20508 // reduction's initial value.
20509 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20510 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
20511 DAG.getUNDEF(ReduceVT), InitVal, Zero);
20512
20513 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
20514
20515 // SVE reductions set the whole vector register with the first element
20516 // containing the reduction result, which we'll now extract.
20517 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
20518 Zero);
20519}
20520
20521// If a merged operation has no inactive lanes we can relax it to a predicated
20522// or unpredicated operation, which potentially allows better isel (perhaps
20523// using immediate forms) or relaxing register reuse requirements.
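// As an illustration (not from the source): sve.sqadd(pg, a, b) with an
// all-active pg has no inactive lanes whose values must be preserved, so it
// can be rebuilt as a plain ISD::SADDSAT, which lets isel pick immediate forms
// and relaxes tied-register constraints.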
20524static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
20525 SelectionDAG &DAG, bool UnpredOp = false,
20526 bool SwapOperands = false) {
20527 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
20528 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
20529 SDValue Pg = N->getOperand(1);
20530 SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
20531 SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
20532
20533 // ISD way to specify an all active predicate.
20534 if (isAllActivePredicate(DAG, Pg)) {
20535 if (UnpredOp)
20536 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
20537
20538 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
20539 }
20540
20541 // FUTURE: SplatVector(true)
20542 return SDValue();
20543}
20544
20545static SDValue performIntrinsicCombine(SDNode *N,
20546 TargetLowering::DAGCombinerInfo &DCI,
20547 const AArch64Subtarget *Subtarget) {
20548 SelectionDAG &DAG = DCI.DAG;
20549 unsigned IID = getIntrinsicID(N);
20550 switch (IID) {
20551 default:
20552 break;
20553 case Intrinsic::aarch64_neon_vcvtfxs2fp:
20554 case Intrinsic::aarch64_neon_vcvtfxu2fp:
20555 return tryCombineFixedPointConvert(N, DCI, DAG);
20556 case Intrinsic::aarch64_neon_saddv:
20557 return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
20558 case Intrinsic::aarch64_neon_uaddv:
20559 return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
20560 case Intrinsic::aarch64_neon_sminv:
20561 return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
20562 case Intrinsic::aarch64_neon_uminv:
20563 return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
20564 case Intrinsic::aarch64_neon_smaxv:
20565 return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
20566 case Intrinsic::aarch64_neon_umaxv:
20567 return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
20568 case Intrinsic::aarch64_neon_fmax:
20569 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
20570 N->getOperand(1), N->getOperand(2));
20571 case Intrinsic::aarch64_neon_fmin:
20572 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
20573 N->getOperand(1), N->getOperand(2));
20574 case Intrinsic::aarch64_neon_fmaxnm:
20575 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
20576 N->getOperand(1), N->getOperand(2));
20577 case Intrinsic::aarch64_neon_fminnm:
20578 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
20579 N->getOperand(1), N->getOperand(2));
20580 case Intrinsic::aarch64_neon_smull:
20581 return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
20582 N->getOperand(1), N->getOperand(2));
20583 case Intrinsic::aarch64_neon_umull:
20584 return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
20585 N->getOperand(1), N->getOperand(2));
20586 case Intrinsic::aarch64_neon_pmull:
20587 return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
20588 N->getOperand(1), N->getOperand(2));
20589 case Intrinsic::aarch64_neon_sqdmull:
20590 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
20591 case Intrinsic::aarch64_neon_sqshl:
20592 case Intrinsic::aarch64_neon_uqshl:
20593 case Intrinsic::aarch64_neon_sqshlu:
20594 case Intrinsic::aarch64_neon_srshl:
20595 case Intrinsic::aarch64_neon_urshl:
20596 case Intrinsic::aarch64_neon_sshl:
20597 case Intrinsic::aarch64_neon_ushl:
20598 return tryCombineShiftImm(IID, N, DAG);
20599 case Intrinsic::aarch64_neon_sabd:
20600 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
20601 N->getOperand(1), N->getOperand(2));
20602 case Intrinsic::aarch64_neon_uabd:
20603 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
20604 N->getOperand(1), N->getOperand(2));
20605 case Intrinsic::aarch64_crc32b:
20606 case Intrinsic::aarch64_crc32cb:
20607 return tryCombineCRC32(0xff, N, DAG);
20608 case Intrinsic::aarch64_crc32h:
20609 case Intrinsic::aarch64_crc32ch:
20610 return tryCombineCRC32(0xffff, N, DAG);
20611 case Intrinsic::aarch64_sve_saddv:
20612 // There is no i64 version of SADDV because the sign is irrelevant.
20613 if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
20614 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
20615 else
20616 return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG);
20617 case Intrinsic::aarch64_sve_uaddv:
20618 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
20619 case Intrinsic::aarch64_sve_smaxv:
20620 return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG);
20621 case Intrinsic::aarch64_sve_umaxv:
20622 return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG);
20623 case Intrinsic::aarch64_sve_sminv:
20624 return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG);
20625 case Intrinsic::aarch64_sve_uminv:
20626 return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG);
20627 case Intrinsic::aarch64_sve_orv:
20628 return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG);
20629 case Intrinsic::aarch64_sve_eorv:
20630 return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG);
20631 case Intrinsic::aarch64_sve_andv:
20632 return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG);
20633 case Intrinsic::aarch64_sve_index:
20634 return LowerSVEIntrinsicIndex(N, DAG);
20635 case Intrinsic::aarch64_sve_dup:
20636 return LowerSVEIntrinsicDUP(N, DAG);
20637 case Intrinsic::aarch64_sve_dup_x:
20638 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
20639 N->getOperand(1));
20640 case Intrinsic::aarch64_sve_ext:
20641 return LowerSVEIntrinsicEXT(N, DAG);
20642 case Intrinsic::aarch64_sve_mul_u:
20643 return DAG.getNode(AArch64ISD::MUL_PRED, SDLoc(N), N->getValueType(0),
20644 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20645 case Intrinsic::aarch64_sve_smulh_u:
20646 return DAG.getNode(AArch64ISD::MULHS_PRED, SDLoc(N), N->getValueType(0),
20647 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20648 case Intrinsic::aarch64_sve_umulh_u:
20649 return DAG.getNode(AArch64ISD::MULHU_PRED, SDLoc(N), N->getValueType(0),
20650 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20651 case Intrinsic::aarch64_sve_smin_u:
20652 return DAG.getNode(AArch64ISD::SMIN_PRED, SDLoc(N), N->getValueType(0),
20653 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20654 case Intrinsic::aarch64_sve_umin_u:
20655 return DAG.getNode(AArch64ISD::UMIN_PRED, SDLoc(N), N->getValueType(0),
20656 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20657 case Intrinsic::aarch64_sve_smax_u:
20658 return DAG.getNode(AArch64ISD::SMAX_PRED, SDLoc(N), N->getValueType(0),
20659 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20660 case Intrinsic::aarch64_sve_umax_u:
20661 return DAG.getNode(AArch64ISD::UMAX_PRED, SDLoc(N), N->getValueType(0),
20662 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20663 case Intrinsic::aarch64_sve_lsl_u:
20664 return DAG.getNode(AArch64ISD::SHL_PRED, SDLoc(N), N->getValueType(0),
20665 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20666 case Intrinsic::aarch64_sve_lsr_u:
20667 return DAG.getNode(AArch64ISD::SRL_PRED, SDLoc(N), N->getValueType(0),
20668 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20669 case Intrinsic::aarch64_sve_asr_u:
20670 return DAG.getNode(AArch64ISD::SRA_PRED, SDLoc(N), N->getValueType(0),
20671 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20672 case Intrinsic::aarch64_sve_fadd_u:
20673 return DAG.getNode(AArch64ISD::FADD_PRED, SDLoc(N), N->getValueType(0),
20674 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20675 case Intrinsic::aarch64_sve_fdiv_u:
20676 return DAG.getNode(AArch64ISD::FDIV_PRED, SDLoc(N), N->getValueType(0),
20677 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20678 case Intrinsic::aarch64_sve_fmax_u:
20679 return DAG.getNode(AArch64ISD::FMAX_PRED, SDLoc(N), N->getValueType(0),
20680 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20681 case Intrinsic::aarch64_sve_fmaxnm_u:
20682 return DAG.getNode(AArch64ISD::FMAXNM_PRED, SDLoc(N), N->getValueType(0),
20683 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20684 case Intrinsic::aarch64_sve_fmla_u:
20685 return DAG.getNode(AArch64ISD::FMA_PRED, SDLoc(N), N->getValueType(0),
20686 N->getOperand(1), N->getOperand(3), N->getOperand(4),
20687 N->getOperand(2));
20688 case Intrinsic::aarch64_sve_fmin_u:
20689 return DAG.getNode(AArch64ISD::FMIN_PRED, SDLoc(N), N->getValueType(0),
20690 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20691 case Intrinsic::aarch64_sve_fminnm_u:
20692 return DAG.getNode(AArch64ISD::FMINNM_PRED, SDLoc(N), N->getValueType(0),
20693 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20694 case Intrinsic::aarch64_sve_fmul_u:
20695 return DAG.getNode(AArch64ISD::FMUL_PRED, SDLoc(N), N->getValueType(0),
20696 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20697 case Intrinsic::aarch64_sve_fsub_u:
20698 return DAG.getNode(AArch64ISD::FSUB_PRED, SDLoc(N), N->getValueType(0),
20699 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20700 case Intrinsic::aarch64_sve_add_u:
20701 return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2),
20702 N->getOperand(3));
20703 case Intrinsic::aarch64_sve_sub_u:
20704 return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2),
20705 N->getOperand(3));
20706 case Intrinsic::aarch64_sve_subr:
20707 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
20708 case Intrinsic::aarch64_sve_and_u:
20709 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), N->getOperand(2),
20710 N->getOperand(3));
20711 case Intrinsic::aarch64_sve_bic_u:
20712 return DAG.getNode(AArch64ISD::BIC, SDLoc(N), N->getValueType(0),
20713 N->getOperand(2), N->getOperand(3));
20714 case Intrinsic::aarch64_sve_eor_u:
20715 return DAG.getNode(ISD::XOR, SDLoc(N), N->getValueType(0), N->getOperand(2),
20716 N->getOperand(3));
20717 case Intrinsic::aarch64_sve_orr_u:
20718 return DAG.getNode(ISD::OR, SDLoc(N), N->getValueType(0), N->getOperand(2),
20719 N->getOperand(3));
20720 case Intrinsic::aarch64_sve_sabd_u:
20721 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
20722 N->getOperand(2), N->getOperand(3));
20723 case Intrinsic::aarch64_sve_uabd_u:
20724 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
20725 N->getOperand(2), N->getOperand(3));
20726 case Intrinsic::aarch64_sve_sdiv_u:
20727 return DAG.getNode(AArch64ISD::SDIV_PRED, SDLoc(N), N->getValueType(0),
20728 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20729 case Intrinsic::aarch64_sve_udiv_u:
20730 return DAG.getNode(AArch64ISD::UDIV_PRED, SDLoc(N), N->getValueType(0),
20731 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20732 case Intrinsic::aarch64_sve_sqadd:
20733 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
20734 case Intrinsic::aarch64_sve_sqsub_u:
20735 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
20736 N->getOperand(2), N->getOperand(3));
20737 case Intrinsic::aarch64_sve_uqadd:
20738 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
20739 case Intrinsic::aarch64_sve_uqsub_u:
20740 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
20741 N->getOperand(2), N->getOperand(3));
20742 case Intrinsic::aarch64_sve_sqadd_x:
20743 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
20744 N->getOperand(1), N->getOperand(2));
20745 case Intrinsic::aarch64_sve_sqsub_x:
20746 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
20747 N->getOperand(1), N->getOperand(2));
20748 case Intrinsic::aarch64_sve_uqadd_x:
20749 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
20750 N->getOperand(1), N->getOperand(2));
20751 case Intrinsic::aarch64_sve_uqsub_x:
20752 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
20753 N->getOperand(1), N->getOperand(2));
20754 case Intrinsic::aarch64_sve_asrd:
20755 return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
20756 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20757 case Intrinsic::aarch64_sve_cmphs:
20758 if (!N->getOperand(2).getValueType().isFloatingPoint())
20759 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20760 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20761 N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
20762 break;
20763 case Intrinsic::aarch64_sve_cmphi:
20764 if (!N->getOperand(2).getValueType().isFloatingPoint())
20765 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20766 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20767 N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
20768 break;
20769 case Intrinsic::aarch64_sve_fcmpge:
20770 case Intrinsic::aarch64_sve_cmpge:
20771 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20772 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20773 N->getOperand(3), DAG.getCondCode(ISD::SETGE));
20774 break;
20775 case Intrinsic::aarch64_sve_fcmpgt:
20776 case Intrinsic::aarch64_sve_cmpgt:
20777 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20778 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20779 N->getOperand(3), DAG.getCondCode(ISD::SETGT));
20780 break;
20781 case Intrinsic::aarch64_sve_fcmpeq:
20782 case Intrinsic::aarch64_sve_cmpeq:
20783 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20784 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20785 N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
20786 break;
20787 case Intrinsic::aarch64_sve_fcmpne:
20788 case Intrinsic::aarch64_sve_cmpne:
20789 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20790 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20791 N->getOperand(3), DAG.getCondCode(ISD::SETNE));
20792 break;
20793 case Intrinsic::aarch64_sve_fcmpuo:
20794 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20795 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20796 N->getOperand(3), DAG.getCondCode(ISD::SETUO));
20797 break;
20798 case Intrinsic::aarch64_sve_fadda:
20799 return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
20800 case Intrinsic::aarch64_sve_faddv:
20801 return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
20802 case Intrinsic::aarch64_sve_fmaxnmv:
20803 return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
20804 case Intrinsic::aarch64_sve_fmaxv:
20805 return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
20806 case Intrinsic::aarch64_sve_fminnmv:
20807 return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
20808 case Intrinsic::aarch64_sve_fminv:
20809 return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
20810 case Intrinsic::aarch64_sve_sel:
20811 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
20812 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20813 case Intrinsic::aarch64_sve_cmpeq_wide:
20814 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
20815 case Intrinsic::aarch64_sve_cmpne_wide:
20816 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
20817 case Intrinsic::aarch64_sve_cmpge_wide:
20818 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
20819 case Intrinsic::aarch64_sve_cmpgt_wide:
20820 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
20821 case Intrinsic::aarch64_sve_cmplt_wide:
20822 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
20823 case Intrinsic::aarch64_sve_cmple_wide:
20824 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
20825 case Intrinsic::aarch64_sve_cmphs_wide:
20826 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
20827 case Intrinsic::aarch64_sve_cmphi_wide:
20828 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
20829 case Intrinsic::aarch64_sve_cmplo_wide:
20830 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
20831 case Intrinsic::aarch64_sve_cmpls_wide:
20832 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
20833 case Intrinsic::aarch64_sve_ptest_any:
20834 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
20835 AArch64CC::ANY_ACTIVE);
20836 case Intrinsic::aarch64_sve_ptest_first:
20837 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
20838 AArch64CC::FIRST_ACTIVE);
20839 case Intrinsic::aarch64_sve_ptest_last:
20840 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
20841 AArch64CC::LAST_ACTIVE);
20842 }
20843 return SDValue();
20844}
20845
20846static bool isCheapToExtend(const SDValue &N) {
20847 unsigned OC = N->getOpcode();
20848 return OC == ISD::LOAD || OC == ISD::MLOAD ||
20849 ISD::isConstantSplatVectorAllZeros(N.getNode());
20850}
20851
20852static SDValue
20853performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20854 SelectionDAG &DAG) {
20855 // If we have (sext (setcc A B)) and A and B are cheap to extend,
20856 // we can move the sext into the arguments and have the same result. For
20857 // example, if A and B are both loads, we can make those extending loads and
20858 // avoid an extra instruction. This pattern appears often in VLS code
20859 // generation where the inputs to the setcc have a different size to the
20860 // instruction that wants to use the result of the setcc.
20861 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
20862 N->getOperand(0)->getOpcode() == ISD::SETCC);
20863 const SDValue SetCC = N->getOperand(0);
20864
20865 const SDValue CCOp0 = SetCC.getOperand(0);
20866 const SDValue CCOp1 = SetCC.getOperand(1);
20867 if (!CCOp0->getValueType(0).isInteger() ||
20868 !CCOp1->getValueType(0).isInteger())
20869 return SDValue();
20870
20871 ISD::CondCode Code =
20872 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();
20873
20874 ISD::NodeType ExtType =
20875 isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
20876
20877 if (isCheapToExtend(SetCC.getOperand(0)) &&
20878 isCheapToExtend(SetCC.getOperand(1))) {
20879 const SDValue Ext1 =
20880 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
20881 const SDValue Ext2 =
20882 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);
20883
20884 return DAG.getSetCC(
20885 SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
20886 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
20887 }
20888
20889 return SDValue();
20890}
20891
20892static SDValue performExtendCombine(SDNode *N,
20893 TargetLowering::DAGCombinerInfo &DCI,
20894 SelectionDAG &DAG) {
20895 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
20896 // we can convert that DUP into another extract_high (of a bigger DUP), which
20897 // helps the backend to decide that an sabdl2 would be useful, saving a real
20898 // extract_high operation.
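 // Sketch (illustrative): zext(abdu (extract_high %v), (dup %s)) is rewritten
 // so the DUP side becomes extract_high(dup128 %s), letting isel match a
 // single uabdl2 instead of an abd plus a separate widening.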
20899 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
20900 (N->getOperand(0).getOpcode() == ISD::ABDU ||
20901 N->getOperand(0).getOpcode() == ISD::ABDS)) {
20902 SDNode *ABDNode = N->getOperand(0).getNode();
20903 SDValue NewABD =
20904 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG);
20905 if (!NewABD.getNode())
20906 return SDValue();
20907
20908 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
20909 }
20910
20911 if (N->getValueType(0).isFixedLengthVector() &&
20912 N->getOpcode() == ISD::SIGN_EXTEND &&
20913 N->getOperand(0)->getOpcode() == ISD::SETCC)
20914 return performSignExtendSetCCCombine(N, DCI, DAG);
20915
20916 return SDValue();
20917}
20918
20919static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
20920 SDValue SplatVal, unsigned NumVecElts) {
20921 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
20922 Align OrigAlignment = St.getAlign();
20923 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
20924
20925 // Create scalar stores. This is at least as good as the code sequence for a
20926 // split unaligned store which is a dup.s, ext.b, and two stores.
20927 // Most of the time the three stores should be replaced by store pair
20928 // instructions (stp).
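 // For example (illustrative): splatting one w-register into a v4i32 store can
 // end up as two paired stores,
 //   stp w1, w1, [x0]
 //   stp w1, w1, [x0, #8]
 // rather than dup v0.4s, w1 followed by an unaligned str q0.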
20929 SDLoc DL(&St);
20930 SDValue BasePtr = St.getBasePtr();
20931 uint64_t BaseOffset = 0;
20932
20933 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
20934 SDValue NewST1 =
20935 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
20936 OrigAlignment, St.getMemOperand()->getFlags());
20937
20938 // As this is in ISel, we will not merge this add, which may degrade results.
20939 if (BasePtr->getOpcode() == ISD::ADD &&
20940 isa<ConstantSDNode>(BasePtr->getOperand(1))) {
20941 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
20942 BasePtr = BasePtr->getOperand(0);
20943 }
20944
20945 unsigned Offset = EltOffset;
20946 while (--NumVecElts) {
20947 Align Alignment = commonAlignment(OrigAlignment, Offset);
20948 SDValue OffsetPtr =
20949 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
20950 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
20951 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
20952 PtrInfo.getWithOffset(Offset), Alignment,
20953 St.getMemOperand()->getFlags());
20954 Offset += EltOffset;
20955 }
20956 return NewST1;
20957}
20958
20959// Returns an SVE type that ContentTy can be trivially sign or zero extended
20960// into.
20961static MVT getSVEContainerType(EVT ContentTy) {
20962 assert(ContentTy.isSimple() && "No SVE containers for extended types");
20963
20964 switch (ContentTy.getSimpleVT().SimpleTy) {
20965 default:
20966 llvm_unreachable("No known SVE container for this MVT type");
20967 case MVT::nxv2i8:
20968 case MVT::nxv2i16:
20969 case MVT::nxv2i32:
20970 case MVT::nxv2i64:
20971 case MVT::nxv2f32:
20972 case MVT::nxv2f64:
20973 return MVT::nxv2i64;
20974 case MVT::nxv4i8:
20975 case MVT::nxv4i16:
20976 case MVT::nxv4i32:
20977 case MVT::nxv4f32:
20978 return MVT::nxv4i32;
20979 case MVT::nxv8i8:
20980 case MVT::nxv8i16:
20981 case MVT::nxv8f16:
20982 case MVT::nxv8bf16:
20983 return MVT::nxv8i16;
20984 case MVT::nxv16i8:
20985 return MVT::nxv16i8;
20986 }
20987}
20988
20989static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
20990 SDLoc DL(N);
20991 EVT VT = N->getValueType(0);
20992
20993 if (VT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
20994 return SDValue();
20995
20996 EVT ContainerVT = VT;
20997 if (ContainerVT.isInteger())
20998 ContainerVT = getSVEContainerType(ContainerVT);
20999
21000 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
21001 SDValue Ops[] = { N->getOperand(0), // Chain
21002 N->getOperand(2), // Pg
21003 N->getOperand(3), // Base
21004 DAG.getValueType(VT) };
21005
21006 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
21007 SDValue LoadChain = SDValue(Load.getNode(), 1);
21008
21009 if (ContainerVT.isInteger() && (VT != ContainerVT))
21010 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
21011
21012 return DAG.getMergeValues({ Load, LoadChain }, DL);
21013}
21014
21015static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
21016 SDLoc DL(N);
21017 EVT VT = N->getValueType(0);
21018 EVT PtrTy = N->getOperand(3).getValueType();
21019
21020 EVT LoadVT = VT;
21021 if (VT.isFloatingPoint())
21022 LoadVT = VT.changeTypeToInteger();
21023
21024 auto *MINode = cast<MemIntrinsicSDNode>(N);
21025 SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
21026 SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
21027 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
21028 MINode->getOperand(2), PassThru,
21029 MINode->getMemoryVT(), MINode->getMemOperand(),
21030 ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
21031
21032 if (VT.isFloatingPoint()) {
21033 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
21034 return DAG.getMergeValues(Ops, DL);
21035 }
21036
21037 return L;
21038}
21039
21040template <unsigned Opcode>
21041static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
21042 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
21043 Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
21044 "Unsupported opcode.");
21045 SDLoc DL(N);
21046 EVT VT = N->getValueType(0);
21047
21048 EVT LoadVT = VT;
21049 if (VT.isFloatingPoint())
21050 LoadVT = VT.changeTypeToInteger();
21051
21052 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
21053 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
21054 SDValue LoadChain = SDValue(Load.getNode(), 1);
21055
21056 if (VT.isFloatingPoint())
21057 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
21058
21059 return DAG.getMergeValues({Load, LoadChain}, DL);
21060}
21061
21062static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
21063 SDLoc DL(N);
21064 SDValue Data = N->getOperand(2);
21065 EVT DataVT = Data.getValueType();
21066 EVT HwSrcVt = getSVEContainerType(DataVT);
21067 SDValue InputVT = DAG.getValueType(DataVT);
21068
21069 if (DataVT.isFloatingPoint())
21070 InputVT = DAG.getValueType(HwSrcVt);
21071
21072 SDValue SrcNew;
21073 if (Data.getValueType().isFloatingPoint())
21074 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
21075 else
21076 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
21077
21078 SDValue Ops[] = { N->getOperand(0), // Chain
21079 SrcNew,
21080 N->getOperand(4), // Base
21081 N->getOperand(3), // Pg
21082 InputVT
21083 };
21084
21085 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
21086}
21087
21088static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
21089 SDLoc DL(N);
21090
21091 SDValue Data = N->getOperand(2);
21092 EVT DataVT = Data.getValueType();
21093 EVT PtrTy = N->getOperand(4).getValueType();
21094
21095 if (DataVT.isFloatingPoint())
21096 Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
21097
21098 auto *MINode = cast<MemIntrinsicSDNode>(N);
21099 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
21100 DAG.getUNDEF(PtrTy), MINode->getOperand(3),
21101 MINode->getMemoryVT(), MINode->getMemOperand(),
21102 ISD::UNINDEXED, false, false);
21103}
21104
21105/// Replace a vector store of a zero splat by scalar stores of WZR/XZR. The
21106/// load store optimizer pass will merge them to store pair stores. This should
21107/// be better than a movi to create the vector zero followed by a vector store
21108/// if the zero constant is not re-used, since one instruction and one register
21109/// live range will be removed.
21110///
21111/// For example, the final generated code should be:
21112///
21113/// stp xzr, xzr, [x0]
21114///
21115/// instead of:
21116///
21117/// movi v0.2d, #0
21118/// str q0, [x0]
21119///
21120static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
21121 SDValue StVal = St.getValue();
21122 EVT VT = StVal.getValueType();
21123
21124 // Avoid scalarizing zero splat stores for scalable vectors.
21125 if (VT.isScalableVector())
21126 return SDValue();
21127
21128 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
21129 // 2, 3 or 4 i32 elements.
21130 int NumVecElts = VT.getVectorNumElements();
21131 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
21132 VT.getVectorElementType().getSizeInBits() == 64) ||
21133 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
21134 VT.getVectorElementType().getSizeInBits() == 32)))
21135 return SDValue();
21136
21137 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
21138 return SDValue();
21139
21140 // If the zero constant has more than one use then the vector store could be
21141 // better since the constant mov will be amortized and stp q instructions
21142 // should be able to be formed.
21143 if (!StVal.hasOneUse())
21144 return SDValue();
21145
21146 // If the store is truncating then it's going down to i16 or smaller, which
21147 // means it can be implemented in a single store anyway.
21148 if (St.isTruncatingStore())
21149 return SDValue();
21150
21151 // If the immediate offset of the address operand is too large for the stp
21152 // instruction, then bail out.
21153 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
21154 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
21155 if (Offset < -512 || Offset > 504)
21156 return SDValue();
21157 }
21158
21159 for (int I = 0; I < NumVecElts; ++I) {
21160 SDValue EltVal = StVal.getOperand(I);
21161 if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
21162 return SDValue();
21163 }
21164
21165 // Use a CopyFromReg WZR/XZR here to prevent
21166 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
21167 SDLoc DL(&St);
21168 unsigned ZeroReg;
21169 EVT ZeroVT;
21170 if (VT.getVectorElementType().getSizeInBits() == 32) {
21171 ZeroReg = AArch64::WZR;
21172 ZeroVT = MVT::i32;
21173 } else {
21174 ZeroReg = AArch64::XZR;
21175 ZeroVT = MVT::i64;
21176 }
21177 SDValue SplatVal =
21178 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
21179 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
21180}
21181
21182/// Replace a vector store of a scalar splat by scalar stores of the scalar
21183/// value. The load store optimizer pass will merge them to store pair stores.
21184/// This has better performance than a splat of the scalar followed by a split
21185/// vector store. Even if the stores are not merged it is four stores vs a dup,
21186/// followed by an ext.b and two stores.
21187static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
21188 SDValue StVal = St.getValue();
21189 EVT VT = StVal.getValueType();
21190
21191 // Don't replace floating point stores, they possibly won't be transformed to
21192 // stp because of the store pair suppress pass.
21193 if (VT.isFloatingPoint())
21194 return SDValue();
21195
21196 // We can express a splat as store pair(s) for 2 or 4 elements.
21197 unsigned NumVecElts = VT.getVectorNumElements();
21198 if (NumVecElts != 4 && NumVecElts != 2)
21199 return SDValue();
21200
21201 // If the store is truncating then it's going down to i16 or smaller, which
21202 // means it can be implemented in a single store anyway.
21203 if (St.isTruncatingStore())
21204 return SDValue();
21205
21206 // Check that this is a splat.
21207 // Make sure that each of the relevant vector element locations are inserted
21208 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
21209 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
21210 SDValue SplatVal;
21211 for (unsigned I = 0; I < NumVecElts; ++I) {
21212 // Check for insert vector elements.
21213 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
21214 return SDValue();
21215
21216 // Check that same value is inserted at each vector element.
21217 if (I == 0)
21218 SplatVal = StVal.getOperand(1);
21219 else if (StVal.getOperand(1) != SplatVal)
21220 return SDValue();
21221
21222 // Check insert element index.
21223 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
21224 if (!CIndex)
21225 return SDValue();
21226 uint64_t IndexVal = CIndex->getZExtValue();
21227 if (IndexVal >= NumVecElts)
21228 return SDValue();
21229 IndexNotInserted.reset(IndexVal);
21230
21231 StVal = StVal.getOperand(0);
21232 }
21233 // Check that all vector element locations were inserted to.
21234 if (IndexNotInserted.any())
21235 return SDValue();
21236
21237 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
21238}
21239
21240static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
21241 SelectionDAG &DAG,
21242 const AArch64Subtarget *Subtarget) {
21243
21244 StoreSDNode *S = cast<StoreSDNode>(N);
21245 if (S->isVolatile() || S->isIndexed())
21246 return SDValue();
21247
21248 SDValue StVal = S->getValue();
21249 EVT VT = StVal.getValueType();
21250
21251 if (!VT.isFixedLengthVector())
21252 return SDValue();
21253
21254 // If we get a splat of zeros, convert this vector store to a store of
21255 // scalars. They will be merged into store pairs of xzr thereby removing one
21256 // instruction and one register.
21257 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
21258 return ReplacedZeroSplat;
21259
21260 // FIXME: The logic for deciding if an unaligned store should be split should
21261 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
21262 // a call to that function here.
21263
21264 if (!Subtarget->isMisaligned128StoreSlow())
21265 return SDValue();
21266
21267 // Don't split at -Oz.
21268 if (DAG.getMachineFunction().getFunction().hasMinSize())
21269 return SDValue();
21270
21271 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
21272 // those up regresses performance on micro-benchmarks and olden/bh.
21273 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
21274 return SDValue();
21275
21276 // Split unaligned 16B stores. They are terrible for performance.
21277 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
21278 // extensions can use this to mark that it does not want splitting to happen
21279 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
21280 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
21281 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
21282 S->getAlign() <= Align(2))
21283 return SDValue();
21284
21285 // If we get a splat of a scalar convert this vector store to a store of
21286 // scalars. They will be merged into store pairs thereby removing two
21287 // instructions.
21288 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
21289 return ReplacedSplat;
21290
21291 SDLoc DL(S);
21292
21293 // Split VT into two.
21294 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
21295 unsigned NumElts = HalfVT.getVectorNumElements();
21296 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
21297 DAG.getConstant(0, DL, MVT::i64));
21298 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
21299 DAG.getConstant(NumElts, DL, MVT::i64));
21300 SDValue BasePtr = S->getBasePtr();
21301 SDValue NewST1 =
21302 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
21303 S->getAlign(), S->getMemOperand()->getFlags());
21304 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
21305 DAG.getConstant(8, DL, MVT::i64));
21306 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
21307 S->getPointerInfo(), S->getAlign(),
21308 S->getMemOperand()->getFlags());
21309}
21310
21311 static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
21312 assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
21313
21314 // splice(pg, op1, undef) -> op1
21315 if (N->getOperand(2).isUndef())
21316 return N->getOperand(1);
21317
21318 return SDValue();
21319}
21320
21321 static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
21322 const AArch64Subtarget *Subtarget) {
21323 assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
21324 N->getOpcode() == AArch64ISD::UUNPKLO) &&
21325 "Unexpected Opcode!");
21326
21327 // uunpklo/hi undef -> undef
21328 if (N->getOperand(0).isUndef())
21329 return DAG.getUNDEF(N->getValueType(0));
21330
21331 // If this is a masked load followed by an UUNPKLO, fold this into a masked
21332 // extending load. We can do this even if this is already a masked
21333 // {z,}extload.
21334 if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
21335 N->getOpcode() == AArch64ISD::UUNPKLO) {
21336 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
21337 SDValue Mask = MLD->getMask();
21338 SDLoc DL(N);
21339
21340 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
21341 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
21342 (MLD->getPassThru()->isUndef() ||
21343 isZerosVector(MLD->getPassThru().getNode()))) {
21344 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
21345 unsigned PgPattern = Mask->getConstantOperandVal(0);
21346 EVT VT = N->getValueType(0);
21347
21348 // Ensure we can double the size of the predicate pattern
21349 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
21350 if (NumElts &&
21351 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
21352 Mask =
21353 getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
21354 SDValue PassThru = DAG.getConstant(0, DL, VT);
21355 SDValue NewLoad = DAG.getMaskedLoad(
21356 VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
21357 PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
21358 MLD->getAddressingMode(), ISD::ZEXTLOAD);
21359
21360 DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));
21361
21362 return NewLoad;
21363 }
21364 }
21365 }
21366
21367 return SDValue();
21368}
21369
21371 if (N->getOpcode() != AArch64ISD::UZP1)
21372 return false;
21373 SDValue Op0 = N->getOperand(0);
21374 EVT SrcVT = Op0->getValueType(0);
21375 EVT DstVT = N->getValueType(0);
21376 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv16i8) ||
21377 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv8i16) ||
21378 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv4i32);
21379}
21380
21381// Try to combine rounding shifts where the operands come from an extend, and
21382// the result is truncated and combined into one vector.
21383// uzp1(rshrnb(uunpklo(X),C), rshrnb(uunpkhi(X), C)) -> urshr(X, C)
21384 static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG) {
21385 assert(N->getOpcode() == AArch64ISD::UZP1 && "Only UZP1 expected.");
21386 SDValue Op0 = N->getOperand(0);
21387 SDValue Op1 = N->getOperand(1);
21388 EVT ResVT = N->getValueType(0);
21389
21390 unsigned RshOpc = Op0.getOpcode();
21391 if (RshOpc != AArch64ISD::RSHRNB_I)
21392 return SDValue();
21393
21394 // Same op code and imm value?
21395 SDValue ShiftValue = Op0.getOperand(1);
21396 if (RshOpc != Op1.getOpcode() || ShiftValue != Op1.getOperand(1))
21397 return SDValue();
21398
21399 // Same unextended operand value?
21400 SDValue Lo = Op0.getOperand(0);
21401 SDValue Hi = Op1.getOperand(0);
21402 if (Lo.getOpcode() != AArch64ISD::UUNPKLO &&
21403 Hi.getOpcode() != AArch64ISD::UUNPKHI)
21404 return SDValue();
21405 SDValue OrigArg = Lo.getOperand(0);
21406 if (OrigArg != Hi.getOperand(0))
21407 return SDValue();
21408
21409 SDLoc DL(N);
21410 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, ResVT,
21411 getPredicateForVector(DAG, DL, ResVT), OrigArg,
21412 ShiftValue);
21413}
21414
21415// Try to simplify:
21416// t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
21417// t2 = nxv8i16 srl(t1, ShiftValue)
21418// to
21419// t1 = nxv8i16 rshrnb(X, shiftvalue).
21420// rshrnb will zero the top half bits of each element. Therefore, this combine
21421// should only be performed when a following instruction with the rshrnb
21422// as an operand does not care about the top half of each element. For example,
21423// a uzp1 or a truncating store.
21424 static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG,
21425 const AArch64Subtarget *Subtarget) {
21426 EVT VT = Srl->getValueType(0);
21427 if (!VT.isScalableVector() || !Subtarget->hasSVE2())
21428 return SDValue();
21429
21430 EVT ResVT;
21431 if (VT == MVT::nxv8i16)
21432 ResVT = MVT::nxv16i8;
21433 else if (VT == MVT::nxv4i32)
21434 ResVT = MVT::nxv8i16;
21435 else if (VT == MVT::nxv2i64)
21436 ResVT = MVT::nxv4i32;
21437 else
21438 return SDValue();
21439
21440 SDLoc DL(Srl);
21441 unsigned ShiftValue;
21442 SDValue RShOperand;
21443 if (!canLowerSRLToRoundingShiftForVT(Srl, ResVT, DAG, ShiftValue, RShOperand))
21444 return SDValue();
21445 SDValue Rshrnb = DAG.getNode(
21446 AArch64ISD::RSHRNB_I, DL, ResVT,
21447 {RShOperand, DAG.getTargetConstant(ShiftValue, DL, MVT::i32)});
21448 return DAG.getNode(ISD::BITCAST, DL, VT, Rshrnb);
21449}
21450
21451 static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
21452 const AArch64Subtarget *Subtarget) {
21453 SDLoc DL(N);
21454 SDValue Op0 = N->getOperand(0);
21455 SDValue Op1 = N->getOperand(1);
21456 EVT ResVT = N->getValueType(0);
21457
21458 // uzp1(x, undef) -> concat(truncate(x), undef)
21459 if (Op1.getOpcode() == ISD::UNDEF) {
21460 EVT BCVT = MVT::Other, HalfVT = MVT::Other;
21461 switch (ResVT.getSimpleVT().SimpleTy) {
21462 default:
21463 break;
21464 case MVT::v16i8:
21465 BCVT = MVT::v8i16;
21466 HalfVT = MVT::v8i8;
21467 break;
21468 case MVT::v8i16:
21469 BCVT = MVT::v4i32;
21470 HalfVT = MVT::v4i16;
21471 break;
21472 case MVT::v4i32:
21473 BCVT = MVT::v2i64;
21474 HalfVT = MVT::v2i32;
21475 break;
21476 }
21477 if (BCVT != MVT::Other) {
21478 SDValue BC = DAG.getBitcast(BCVT, Op0);
21479 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
21480 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
21481 DAG.getUNDEF(HalfVT));
21482 }
21483 }
21484
21485 if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
21486 return Urshr;
21487
21488 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op0, DAG, Subtarget))
21489 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1);
21490
21491 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op1, DAG, Subtarget))
21492 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb);
21493
21494 // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
21495 if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
21496 if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
21497 SDValue X = Op0.getOperand(0).getOperand(0);
21498 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
21499 }
21500 }
21501
21502 // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z)
21503 if (Op1.getOpcode() == AArch64ISD::UUNPKHI) {
21504 if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
21505 SDValue Z = Op1.getOperand(0).getOperand(1);
21506 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
21507 }
21508 }
21509
21510 // These optimizations only work on little endian.
21511 if (!DAG.getDataLayout().isLittleEndian())
21512 return SDValue();
21513
21514 // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
21515 // Example:
21516 // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
21517 // to
21518 // nxv4i32 = uzp1 nxv4i32 x, nxv4i32 y
21520 Op0.getOpcode() == ISD::BITCAST && Op1.getOpcode() == ISD::BITCAST) {
21521 if (Op0.getOperand(0).getValueType() == Op1.getOperand(0).getValueType()) {
21522 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0.getOperand(0),
21523 Op1.getOperand(0));
21524 }
21525 }
21526
21527 if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
21528 return SDValue();
21529
21530 SDValue SourceOp0 = peekThroughBitcasts(Op0);
21531 SDValue SourceOp1 = peekThroughBitcasts(Op1);
21532
21533 // truncating uzp1(x, y) -> xtn(concat (x, y))
21534 if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
21535 EVT Op0Ty = SourceOp0.getValueType();
21536 if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
21537 (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
21538 SDValue Concat =
21539 DAG.getNode(ISD::CONCAT_VECTORS, DL,
21540 Op0Ty.getDoubleNumVectorElementsVT(*DAG.getContext()),
21541 SourceOp0, SourceOp1);
21542 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Concat);
21543 }
21544 }
21545
21546 // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
21547 if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
21548 SourceOp1.getOpcode() != ISD::TRUNCATE)
21549 return SDValue();
21550 SourceOp0 = SourceOp0.getOperand(0);
21551 SourceOp1 = SourceOp1.getOperand(0);
21552
21553 if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
21554 !SourceOp0.getValueType().isSimple())
21555 return SDValue();
21556
21557 EVT ResultTy;
21558
21559 switch (SourceOp0.getSimpleValueType().SimpleTy) {
21560 case MVT::v2i64:
21561 ResultTy = MVT::v4i32;
21562 break;
21563 case MVT::v4i32:
21564 ResultTy = MVT::v8i16;
21565 break;
21566 case MVT::v8i16:
21567 ResultTy = MVT::v16i8;
21568 break;
21569 default:
21570 return SDValue();
21571 }
21572
21573 SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
21574 SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
21575 SDValue UzpResult =
21576 DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
21577
21578 EVT BitcastResultTy;
21579
21580 switch (ResVT.getSimpleVT().SimpleTy) {
21581 case MVT::v2i32:
21582 BitcastResultTy = MVT::v2i64;
21583 break;
21584 case MVT::v4i16:
21585 BitcastResultTy = MVT::v4i32;
21586 break;
21587 case MVT::v8i8:
21588 BitcastResultTy = MVT::v8i16;
21589 break;
21590 default:
21591 llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
21592 }
21593
21594 return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
21595 DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
21596}
21597
21599 unsigned Opc = N->getOpcode();
21600
21601 assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
21603 (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
21605 "Invalid opcode.");
21606
21607 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
21609 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
21611 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
21615
21616 SDLoc DL(N);
21617 SDValue Chain = N->getOperand(0);
21618 SDValue Pg = N->getOperand(1);
21619 SDValue Base = N->getOperand(2);
21620 SDValue Offset = N->getOperand(3);
21621 SDValue Ty = N->getOperand(4);
21622
21623 EVT ResVT = N->getValueType(0);
21624
21625 const auto OffsetOpc = Offset.getOpcode();
21626 const bool OffsetIsZExt =
21628 const bool OffsetIsSExt =
21630
21631 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
21632 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
21633 SDValue ExtPg = Offset.getOperand(0);
21634 VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
21635 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
21636
21637 // If the predicate for the sign- or zero-extended offset is the
21638 // same as the predicate used for this load and the sign-/zero-extension
21639 // was from 32 bits...
21640 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
21641 SDValue UnextendedOffset = Offset.getOperand(1);
21642
21643 unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
21644 if (Signed)
21645 NewOpc = getSignExtendedGatherOpcode(NewOpc);
21646
21647 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
21648 {Chain, Pg, Base, UnextendedOffset, Ty});
21649 }
21650 }
21651
21652 return SDValue();
21653}
21654
21655/// Optimize a vector shift instruction and its operand if shifted out
21656/// bits are not used.
21658 const AArch64TargetLowering &TLI,
21660 assert(N->getOpcode() == AArch64ISD::VASHR ||
21661 N->getOpcode() == AArch64ISD::VLSHR);
21662
21663 SDValue Op = N->getOperand(0);
21664 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
21665
21666 unsigned ShiftImm = N->getConstantOperandVal(1);
21667 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
21668
21669 // Remove sign_extend_inreg (ashr(shl(x))) based on the number of sign bits.
21670 if (N->getOpcode() == AArch64ISD::VASHR &&
21671 Op.getOpcode() == AArch64ISD::VSHL &&
21672 N->getOperand(1) == Op.getOperand(1))
21673 if (DCI.DAG.ComputeNumSignBits(Op.getOperand(0)) > ShiftImm)
21674 return Op.getOperand(0);
21675
21676 APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
21677 APInt DemandedMask = ~ShiftedOutBits;
21678
21679 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
21680 return SDValue(N, 0);
21681
21682 return SDValue();
21683}
21684
21686 // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
21687 // This transform works in partnership with performSetCCPunpkCombine to
21688 // remove unnecessary transfer of predicates into standard registers and back
21689 if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
21690 N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
21691 MVT::i1) {
21692 SDValue CC = N->getOperand(0)->getOperand(0);
21693 auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
21694 SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
21695 DAG.getVectorIdxConstant(0, SDLoc(N)));
21696 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
21697 }
21698
21699 return SDValue();
21700}
21701
21702/// Target-specific DAG combine function for post-increment LD1 (lane) and
21703/// post-increment LD1R.
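/// For example (illustrative): a scalar load feeding a DUP or a single-lane
/// insert, whose address is then incremented by the element size, can be
/// rewritten as LD1DUPpost/LD1LANEpost, which also produces the updated
/// address.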
21704 static SDValue performPostLD1Combine(SDNode *N,
21705 TargetLowering::DAGCombinerInfo &DCI,
21706 bool IsLaneOp) {
21707 if (DCI.isBeforeLegalizeOps())
21708 return SDValue();
21709
21710 SelectionDAG &DAG = DCI.DAG;
21711 EVT VT = N->getValueType(0);
21712
21713 if (!VT.is128BitVector() && !VT.is64BitVector())
21714 return SDValue();
21715
21716 unsigned LoadIdx = IsLaneOp ? 1 : 0;
21717 SDNode *LD = N->getOperand(LoadIdx).getNode();
21719 // If it is not a LOAD, we cannot do this combine.
21719 if (LD->getOpcode() != ISD::LOAD)
21720 return SDValue();
21721
21722 // The vector lane must be a constant in the LD1LANE opcode.
21723 SDValue Lane;
21724 if (IsLaneOp) {
21725 Lane = N->getOperand(2);
21726 auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
21727 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
21728 return SDValue();
21729 }
21730
21731 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
21732 EVT MemVT = LoadSDN->getMemoryVT();
21733 // Check if memory operand is the same type as the vector element.
21734 if (MemVT != VT.getVectorElementType())
21735 return SDValue();
21736
21737 // Check if there are other uses. If so, do not combine as it will introduce
21738 // an extra load.
21739 for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
21740 ++UI) {
21741 if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
21742 continue;
21743 if (*UI != N)
21744 return SDValue();
21745 }
21746
21747 // If there is one use and it can splat the value, prefer that operation.
21748 // TODO: This could be expanded to more operations if they reliably use the
21749 // index variants.
21750 if (N->hasOneUse()) {
21751 unsigned UseOpc = N->use_begin()->getOpcode();
21752 if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
21753 return SDValue();
21754 }
21755
21756 SDValue Addr = LD->getOperand(1);
21757 SDValue Vector = N->getOperand(0);
21758 // Search for a use of the address operand that is an increment.
21759 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
21760 Addr.getNode()->use_end(); UI != UE; ++UI) {
21761 SDNode *User = *UI;
21762 if (User->getOpcode() != ISD::ADD
21763 || UI.getUse().getResNo() != Addr.getResNo())
21764 continue;
21765
21766 // If the increment is a constant, it must match the memory ref size.
21767 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
21768 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
21769 uint32_t IncVal = CInc->getZExtValue();
21770 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
21771 if (IncVal != NumBytes)
21772 continue;
21773 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
21774 }
21775
21776 // To avoid cycle construction make sure that neither the load nor the add
21777 // are predecessors to each other or the Vector.
21780 Visited.insert(Addr.getNode());
21781 Worklist.push_back(User);
21782 Worklist.push_back(LD);
21783 Worklist.push_back(Vector.getNode());
21784 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
21785 SDNode::hasPredecessorHelper(User, Visited, Worklist))
21786 continue;
21787
21789 Ops.push_back(LD->getOperand(0)); // Chain
21790 if (IsLaneOp) {
21791 Ops.push_back(Vector); // The vector to be inserted
21792 Ops.push_back(Lane); // The lane to be inserted in the vector
21793 }
21794 Ops.push_back(Addr);
21795 Ops.push_back(Inc);
21796
21797 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
21798 SDVTList SDTys = DAG.getVTList(Tys);
21799 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
21800 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
21801 MemVT,
21802 LoadSDN->getMemOperand());
21803
21804 // Update the uses.
21805 SDValue NewResults[] = {
21806 SDValue(LD, 0), // The result of load
21807 SDValue(UpdN.getNode(), 2) // Chain
21808 };
21809 DCI.CombineTo(LD, NewResults);
21810 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
21811 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
21812
21813 break;
21814 }
21815 return SDValue();
21816}
21817
21818/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
21819/// address translation.
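/// With top-byte-ignore, only bits [55:0] of a pointer take part in address
/// translation, so bits [63:56] can be treated as not demanded; that is,
/// roughly, what the 56-bit demanded mask below expresses.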
21820 static bool performTBISimplification(SDValue Addr,
21821 TargetLowering::DAGCombinerInfo &DCI,
21822 SelectionDAG &DAG) {
21823 APInt DemandedMask = APInt::getLowBitsSet(64, 56);
21824 KnownBits Known;
21825 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
21826 !DCI.isBeforeLegalizeOps());
21827 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21828 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
21829 DCI.CommitTargetLoweringOpt(TLO);
21830 return true;
21831 }
21832 return false;
21833}
21834
21835 static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
21836 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
21837 "Expected STORE dag node in input!");
21838
21839 if (auto Store = dyn_cast<StoreSDNode>(N)) {
21840 if (!Store->isTruncatingStore() || Store->isIndexed())
21841 return SDValue();
21842 SDValue Ext = Store->getValue();
21843 auto ExtOpCode = Ext.getOpcode();
21844 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
21845 ExtOpCode != ISD::ANY_EXTEND)
21846 return SDValue();
21847 SDValue Orig = Ext->getOperand(0);
21848 if (Store->getMemoryVT() != Orig.getValueType())
21849 return SDValue();
21850 return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
21851 Store->getBasePtr(), Store->getMemOperand());
21852 }
21853
21854 return SDValue();
21855}
21856
21857// A custom combine to lower load <3 x i8> as the more efficient sequence
21858// below:
21859// ldrb wX, [x0, #2]
21860// ldrh wY, [x0]
21861// orr wX, wY, wX, lsl #16
21862// fmov s0, wX
21863//
21864// Note that an alternative sequence with even fewer (although usually more
21865// complex/expensive) instructions would be:
21866// ld1r.4h { v0 }, [x0], #2
21867// ld1.b { v0 }[2], [x0]
21868//
21869// Generating this sequence unfortunately results in noticeably worse codegen
21870// for code that extends the loaded v3i8, due to legalization breaking vector
21871// shuffle detection in a way that is very difficult to work around.
21872// TODO: Revisit once v3i8 legalization has been improved in general.
21873 static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
21874 EVT MemVT = LD->getMemoryVT();
21875 if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
21876 LD->getOriginalAlign() >= 4)
21877 return SDValue();
21878
21879 SDLoc DL(LD);
21880 MachineFunction &MF = DAG.getMachineFunction();
21881 SDValue Chain = LD->getChain();
21882 SDValue BasePtr = LD->getBasePtr();
21883 MachineMemOperand *MMO = LD->getMemOperand();
21884 assert(LD->getOffset().isUndef() && "undef offset expected");
21885
21886 // Load 2 x i8, then 1 x i8.
21887 SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
21888 TypeSize Offset2 = TypeSize::getFixed(2);
21889 SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
21890 DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
21891 MF.getMachineMemOperand(MMO, 2, 1));
21892
21893 // Extend to i32.
21894 SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
21895 SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
21896
21897 // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
21898 SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
21899 DAG.getConstant(16, DL, MVT::i32));
21900 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl);
21901 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
21902
21903 // Extract v3i8 again.
21904 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
21905 DAG.getConstant(0, DL, MVT::i64));
21906 SDValue TokenFactor = DAG.getNode(
21907 ISD::TokenFactor, DL, MVT::Other,
21908 {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
21909 return DAG.getMergeValues({Extract, TokenFactor}, DL);
21910}
21911
21912 // Perform TBI simplification if supported by the target, and try to break up
21913 // nontemporal loads larger than 256 bits for odd types so that LDNPQ 256-bit
21914 // load instructions can be selected.
21915 static SDValue performLOADCombine(SDNode *N,
21916 TargetLowering::DAGCombinerInfo &DCI,
21917 SelectionDAG &DAG,
21918 const AArch64Subtarget *Subtarget) {
21919 if (Subtarget->supportsAddressTopByteIgnored())
21920 performTBISimplification(N->getOperand(1), DCI, DAG);
21921
21922 LoadSDNode *LD = cast<LoadSDNode>(N);
21923 if (LD->isVolatile() || !Subtarget->isLittleEndian())
21924 return SDValue(N, 0);
21925
21926 if (SDValue Res = combineV3I8LoadExt(LD, DAG))
21927 return Res;
21928
21929 if (!LD->isNonTemporal())
21930 return SDValue(N, 0);
21931
21932 EVT MemVT = LD->getMemoryVT();
21933 if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
21934 MemVT.getSizeInBits() % 256 == 0 ||
21935 256 % MemVT.getScalarSizeInBits() != 0)
21936 return SDValue(N, 0);
21937
21938 SDLoc DL(LD);
21939 SDValue Chain = LD->getChain();
21940 SDValue BasePtr = LD->getBasePtr();
21941 SDNodeFlags Flags = LD->getFlags();
21942 SmallVector<SDValue, 4> LoadOps;
21943 SmallVector<SDValue, 4> LoadOpsChain;
21944 // Replace any non-temporal load over 256 bits with a series of 256-bit loads
21945 // and a final scalar/vector load of less than 256 bits. This way we can
21946 // utilize 256-bit loads and reduce the number of load instructions generated.
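// For example (illustrative): a non-temporal v12i32 (384-bit) load becomes one
// 256-bit v8i32 load plus a 128-bit v4i32 load; the results are concatenated
// and the original type is extracted back out.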
21947 MVT NewVT =
21949 256 / MemVT.getVectorElementType().getSizeInBits());
21950 unsigned Num256Loads = MemVT.getSizeInBits() / 256;
21951 // Create all 256-bit loads starting from offset 0 up to offset (Num256Loads - 1) * 32.
21952 for (unsigned I = 0; I < Num256Loads; I++) {
21953 unsigned PtrOffset = I * 32;
21954 SDValue NewPtr = DAG.getMemBasePlusOffset(
21955 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
21956 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
21957 SDValue NewLoad = DAG.getLoad(
21958 NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
21959 NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
21960 LoadOps.push_back(NewLoad);
21961 LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
21962 }
21963
21964 // Process remaining bits of the load operation.
21965 // This is done by creating an UNDEF vector to match the size of the
21966 // 256-bit loads and inserting the remaining load to it. We extract the
21967 // original load type at the end using EXTRACT_SUBVECTOR instruction.
21968 unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
21969 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
21970 MVT RemainingVT = MVT::getVectorVT(
21972 BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
21973 SDValue NewPtr = DAG.getMemBasePlusOffset(
21974 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
21975 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
21976 SDValue RemainingLoad =
21977 DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
21978 LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
21979 LD->getMemOperand()->getFlags(), LD->getAAInfo());
21980 SDValue UndefVector = DAG.getUNDEF(NewVT);
21981 SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
21982 SDValue ExtendedRemainingLoad =
21983 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
21984 {UndefVector, RemainingLoad, InsertIdx});
21985 LoadOps.push_back(ExtendedRemainingLoad);
21986 LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
21987 EVT ConcatVT =
21989 LoadOps.size() * NewVT.getVectorNumElements());
21990 SDValue ConcatVectors =
21991 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
21992 // Extract the original vector type size.
21993 SDValue ExtractSubVector =
21994 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
21995 {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
21996 SDValue TokenFactor =
21997 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
21998 return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
21999}
22000
22001 static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) {
22002 EVT VecVT = Op.getValueType();
22003 assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
22004 "Need boolean vector type.");
22005
22006 if (Depth > 3)
22007 return EVT();
22008
22009 // We can get the base type from a vector compare or truncate.
22010 if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
22011 return Op.getOperand(0).getValueType();
22012
22013 // If an operand is a bool vector, continue looking.
22014 EVT BaseVT;
22015 for (SDValue Operand : Op->op_values()) {
22016 if (Operand.getValueType() != VecVT)
22017 continue;
22018
22019 EVT OperandVT = tryGetOriginalBoolVectorType(Operand, Depth + 1);
22020 if (!BaseVT.isSimple())
22021 BaseVT = OperandVT;
22022 else if (OperandVT != BaseVT)
22023 return EVT();
22024 }
22025
22026 return BaseVT;
22027}
22028
22029// When converting a <N x iX> vector to <N x i1> to store or use as a scalar
22030// iN, we can use a trick that extracts the i^th bit from the i^th element and
22031// then performs a vector add to get a scalar bitmask. This requires that each
22032// element's bits are either all 1 or all 0.
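// For example (illustrative): for a v4i32 comparison result (each lane all
// ones or all zeros), ANDing with the mask <1, 2, 4, 8> and doing a
// VECREDUCE_ADD yields a 4-bit scalar bitmask, one bit per lane.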
22033 static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
22034 SDLoc DL(N);
22035 SDValue ComparisonResult(N, 0);
22036 EVT VecVT = ComparisonResult.getValueType();
22037 assert(VecVT.isVector() && "Must be a vector type");
22038
22039 unsigned NumElts = VecVT.getVectorNumElements();
22040 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
22041 return SDValue();
22042
22043 if (VecVT.getVectorElementType() != MVT::i1 &&
22044 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
22045 return SDValue();
22046
22047 // If we can find the original types to work on instead of a vector of i1,
22048 // we can avoid extend/extract conversion instructions.
22049 if (VecVT.getVectorElementType() == MVT::i1) {
22050 VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
22051 if (!VecVT.isSimple()) {
22052 unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector
22053 VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
22054 }
22055 }
22056 VecVT = VecVT.changeVectorElementTypeToInteger();
22057
22058 // Large vectors don't map directly to this conversion, so to avoid too many
22059 // edge cases, we don't apply it here. The conversion will likely still be
22060 // applied later via multiple smaller vectors, whose results are concatenated.
22061 if (VecVT.getSizeInBits() > 128)
22062 return SDValue();
22063
22064 // Ensure that all elements' bits are either 0s or 1s.
22065 ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
22066
22067 SmallVector<SDValue, 16> MaskConstants;
22068 if (VecVT == MVT::v16i8) {
22069 // v16i8 is a special case, as we have 16 entries but only 8 positional bits
22070 // per entry. We split it into two halves, apply the mask, zip the halves to
22071 // create 8x 16-bit values, and then perform the vector reduce.
22072 for (unsigned Half = 0; Half < 2; ++Half) {
22073 for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) {
22074 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
22075 }
22076 }
22077 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
22078 SDValue RepresentativeBits =
22079 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
22080
22081 SDValue UpperRepresentativeBits =
22082 DAG.getNode(AArch64ISD::EXT, DL, VecVT, RepresentativeBits,
22083 RepresentativeBits, DAG.getConstant(8, DL, MVT::i32));
22084 SDValue Zipped = DAG.getNode(AArch64ISD::ZIP1, DL, VecVT,
22085 RepresentativeBits, UpperRepresentativeBits);
22086 Zipped = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Zipped);
22087 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, Zipped);
22088 }
22089
22090 // All other vector sizes.
22091 unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1);
22092 for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) {
22093 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
22094 }
22095
22096 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
22097 SDValue RepresentativeBits =
22098 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
22099 EVT ResultVT = MVT::getIntegerVT(std::max<unsigned>(
22100 NumElts, VecVT.getVectorElementType().getSizeInBits()));
22101 return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits);
22102}
22103
22104 static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
22105 StoreSDNode *Store) {
22106 if (!Store->isTruncatingStore())
22107 return SDValue();
22108
22109 SDLoc DL(Store);
22110 SDValue VecOp = Store->getValue();
22111 EVT VT = VecOp.getValueType();
22112 EVT MemVT = Store->getMemoryVT();
22113
22114 if (!MemVT.isVector() || !VT.isVector() ||
22115 MemVT.getVectorElementType() != MVT::i1)
22116 return SDValue();
22117
22118 // If we are storing a vector that we are currently building, let
22119 // `scalarizeVectorStore()` handle this more efficiently.
22120 if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
22121 return SDValue();
22122
22123 VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp);
22124 SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG);
22125 if (!VectorBits)
22126 return SDValue();
22127
22128 EVT StoreVT =
22130 SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT);
22131 return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(),
22132 Store->getMemOperand());
22133}
22134
22135 static bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
22136 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
22137 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
22138 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
22139}
22140
22141// Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
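// For example (illustrative): the <3 x i8> value is widened to four elements,
// bitcast to bytes, and emitted as three single-byte stores at offsets 2, 1
// and 0 from the base pointer.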
22142 static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
22143 const AArch64Subtarget *Subtarget) {
22144 SDValue Value = ST->getValue();
22145 EVT ValueVT = Value.getValueType();
22146
22147 if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
22148 Value.getOpcode() != ISD::TRUNCATE ||
22149 ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
22150 return SDValue();
22151
22152 assert(ST->getOffset().isUndef() && "undef offset expected");
22153 SDLoc DL(ST);
22154 auto WideVT = EVT::getVectorVT(
22155 *DAG.getContext(),
22156 Value->getOperand(0).getValueType().getVectorElementType(), 4);
22157 SDValue UndefVector = DAG.getUNDEF(WideVT);
22158 SDValue WideTrunc = DAG.getNode(
22159 ISD::INSERT_SUBVECTOR, DL, WideVT,
22160 {UndefVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)});
22161 SDValue Cast = DAG.getNode(
22162 ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
22163 WideTrunc);
22164
22165 MachineFunction &MF = DAG.getMachineFunction();
22166 SDValue Chain = ST->getChain();
22167 MachineMemOperand *MMO = ST->getMemOperand();
22168 unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
22169 SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22170 DAG.getConstant(2 * IdxScale, DL, MVT::i64));
22171 TypeSize Offset2 = TypeSize::getFixed(2);
22172 SDValue Ptr2 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset2, DL);
22173 Chain = DAG.getStore(Chain, DL, E2, Ptr2, MF.getMachineMemOperand(MMO, 2, 1));
22174
22175 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22176 DAG.getConstant(1 * IdxScale, DL, MVT::i64));
22177 TypeSize Offset1 = TypeSize::getFixed(1);
22178 SDValue Ptr1 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset1, DL);
22179 Chain = DAG.getStore(Chain, DL, E1, Ptr1, MF.getMachineMemOperand(MMO, 1, 1));
22180
22181 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22182 DAG.getConstant(0, DL, MVT::i64));
22183 Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(),
22184 MF.getMachineMemOperand(MMO, 0, 1));
22185 return Chain;
22186}
22187
22188 static SDValue performSTORECombine(SDNode *N,
22189 TargetLowering::DAGCombinerInfo &DCI,
22190 SelectionDAG &DAG,
22191 const AArch64Subtarget *Subtarget) {
22192 StoreSDNode *ST = cast<StoreSDNode>(N);
22193 SDValue Chain = ST->getChain();
22194 SDValue Value = ST->getValue();
22195 SDValue Ptr = ST->getBasePtr();
22196 EVT ValueVT = Value.getValueType();
22197
22198 auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
22199 EVT EltVT = VT.getVectorElementType();
22200 return EltVT == MVT::f32 || EltVT == MVT::f64;
22201 };
22202
22203 if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
22204 return Res;
22205
22206 // If this is an FP_ROUND followed by a store, fold this into a truncating
22207 // store. We can do this even if this is already a truncstore.
22208 // We purposefully don't care about legality of the nodes here as we know
22209 // they can be split down into something legal.
22210 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
22211 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
22212 Subtarget->useSVEForFixedLengthVectors() &&
22213 ValueVT.isFixedLengthVector() &&
22214 ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
22215 hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
22216 return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
22217 ST->getMemoryVT(), ST->getMemOperand());
22218
22219 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
22220 return Split;
22221
22222 if (Subtarget->supportsAddressTopByteIgnored() &&
22223 performTBISimplification(N->getOperand(2), DCI, DAG))
22224 return SDValue(N, 0);
22225
22226 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
22227 return Store;
22228
22229 if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
22230 return Store;
22231
22232 if (ST->isTruncatingStore()) {
22233 EVT StoreVT = ST->getMemoryVT();
22234 if (!isHalvingTruncateOfLegalScalableType(ValueVT, StoreVT))
22235 return SDValue();
22236 if (SDValue Rshrnb =
22237 trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) {
22238 return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(),
22239 StoreVT, ST->getMemOperand());
22240 }
22241 }
22242
22243 return SDValue();
22244}
22245
22246 static SDValue performMSTORECombine(SDNode *N,
22247 TargetLowering::DAGCombinerInfo &DCI,
22248 SelectionDAG &DAG,
22249 const AArch64Subtarget *Subtarget) {
22250 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
22251 SDValue Value = MST->getValue();
22252 SDValue Mask = MST->getMask();
22253 SDLoc DL(N);
22254
22255 // If this is a UZP1 followed by a masked store, fold this into a masked
22256 // truncating store. We can do this even if this is already a masked
22257 // truncstore.
22258 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
22259 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
22260 Value.getValueType().isInteger()) {
22261 Value = Value.getOperand(0);
22262 if (Value.getOpcode() == ISD::BITCAST) {
22263 EVT HalfVT =
22264 Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
22265 EVT InVT = Value.getOperand(0).getValueType();
22266
22267 if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
22268 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
22269 unsigned PgPattern = Mask->getConstantOperandVal(0);
22270
22271 // Ensure we can double the size of the predicate pattern
22272 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
22273 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
22274 MinSVESize) {
22275 Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
22276 PgPattern);
22277 return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
22278 MST->getBasePtr(), MST->getOffset(), Mask,
22279 MST->getMemoryVT(), MST->getMemOperand(),
22280 MST->getAddressingMode(),
22281 /*IsTruncating=*/true);
22282 }
22283 }
22284 }
22285 }
22286
22287 if (MST->isTruncatingStore()) {
22288 EVT ValueVT = Value->getValueType(0);
22289 EVT MemVT = MST->getMemoryVT();
22290 if (!isHalvingTruncateOfLegalScalableType(ValueVT, MemVT))
22291 return SDValue();
22292 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Value, DAG, Subtarget)) {
22293 return DAG.getMaskedStore(MST->getChain(), DL, Rshrnb, MST->getBasePtr(),
22294 MST->getOffset(), MST->getMask(),
22295 MST->getMemoryVT(), MST->getMemOperand(),
22296 MST->getAddressingMode(), true);
22297 }
22298 }
22299
22300 return SDValue();
22301}
22302
22303/// \return true if part of the index was folded into the Base.
22304static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
22305 SDLoc DL, SelectionDAG &DAG) {
22306 // This function assumes a vector of i64 indices.
22307 EVT IndexVT = Index.getValueType();
22308 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
22309 return false;
22310
22311 // Simplify:
22312 // BasePtr = Ptr
22313 // Index = X + splat(Offset)
22314 // ->
22315 // BasePtr = Ptr + Offset * scale.
22316 // Index = X
22317 if (Index.getOpcode() == ISD::ADD) {
22318 if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
22319 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
22320 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
22321 Index = Index.getOperand(0);
22322 return true;
22323 }
22324 }
22325
22326 // Simplify:
22327 // BasePtr = Ptr
22328 // Index = (X + splat(Offset)) << splat(Shift)
22329 // ->
22330 // BasePtr = Ptr + (Offset << Shift) * scale)
22331 // Index = X << splat(shift)
22332 if (Index.getOpcode() == ISD::SHL &&
22333 Index.getOperand(0).getOpcode() == ISD::ADD) {
22334 SDValue Add = Index.getOperand(0);
22335 SDValue ShiftOp = Index.getOperand(1);
22336 SDValue OffsetOp = Add.getOperand(1);
22337 if (auto Shift = DAG.getSplatValue(ShiftOp))
22338 if (auto Offset = DAG.getSplatValue(OffsetOp)) {
22339 Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
22340 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
22341 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
22342 Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
22343 Add.getOperand(0), ShiftOp);
22344 return true;
22345 }
22346 }
22347
22348 return false;
22349}
22350
22351// Analyse the specified address returning true if a more optimal addressing
22352// mode is available. When returning true all parameters are updated to reflect
22353// their recommended values.
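// For example (illustrative): a splat offset added to the index is folded into
// the base pointer, and a 64-bit step-vector index with a small constant
// stride is narrowed to a 32-bit step vector when every reachable offset fits
// in an i32.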
22354 static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
22355 SDValue &BasePtr, SDValue &Index,
22356 SelectionDAG &DAG) {
22357 // Try to iteratively fold parts of the index into the base pointer to
22358 // simplify the index as much as possible.
22359 bool Changed = false;
22360 while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
22361 Changed = true;
22362
22363 // Only consider element types that are pointer sized as smaller types can
22364 // be easily promoted.
22365 EVT IndexVT = Index.getValueType();
22366 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
22367 return Changed;
22368
22369 // Can indices be trivially shrunk?
22370 EVT DataVT = N->getOperand(1).getValueType();
22371 // Don't attempt to shrink the index for fixed vectors of 64 bit data since it
22372 // will later be re-extended to 64 bits in legalization
22373 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
22374 return Changed;
22375 if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
22376 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
22377 Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
22378 return true;
22379 }
22380
22381 // Match:
22382 // Index = step(const)
22383 int64_t Stride = 0;
22384 if (Index.getOpcode() == ISD::STEP_VECTOR) {
22385 Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
22386 }
22387 // Match:
22388 // Index = step(const) << shift(const)
22389 else if (Index.getOpcode() == ISD::SHL &&
22390 Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
22391 SDValue RHS = Index.getOperand(1);
22392 if (auto *Shift =
22393 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
22394 int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
22395 Stride = Step << Shift->getZExtValue();
22396 }
22397 }
22398
22399 // Return early because no supported pattern is found.
22400 if (Stride == 0)
22401 return Changed;
22402
22403 if (Stride < std::numeric_limits<int32_t>::min() ||
22404 Stride > std::numeric_limits<int32_t>::max())
22405 return Changed;
22406
22407 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
22408 unsigned MaxVScale =
22410 int64_t LastElementOffset =
22411 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
22412
22413 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
22414 LastElementOffset > std::numeric_limits<int32_t>::max())
22415 return Changed;
22416
22417 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
22418 // Stride does not scale explicitly by 'Scale', because it happens in
22419 // the gather/scatter addressing mode.
22420 Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride));
22421 return true;
22422}
22423
22426 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
22427 assert(MGS && "Can only combine gather load or scatter store nodes");
22428
22429 if (!DCI.isBeforeLegalize())
22430 return SDValue();
22431
22432 SDLoc DL(MGS);
22433 SDValue Chain = MGS->getChain();
22434 SDValue Scale = MGS->getScale();
22435 SDValue Index = MGS->getIndex();
22436 SDValue Mask = MGS->getMask();
22437 SDValue BasePtr = MGS->getBasePtr();
22438 ISD::MemIndexType IndexType = MGS->getIndexType();
22439
22440 if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
22441 return SDValue();
22442
22443 // Here we catch such cases early and change MGATHER's IndexType to allow
22444 // the use of an Index that's more legalisation friendly.
22445 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
22446 SDValue PassThru = MGT->getPassThru();
22447 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
22448 return DAG.getMaskedGather(
22449 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
22450 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
22451 }
22452 auto *MSC = cast<MaskedScatterSDNode>(MGS);
22453 SDValue Data = MSC->getValue();
22454 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
22455 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL,
22456 Ops, MSC->getMemOperand(), IndexType,
22457 MSC->isTruncatingStore());
22458}
22459
22460/// Target-specific DAG combine function for NEON load/store intrinsics
22461/// to merge base address updates.
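/// For example (illustrative): an st2 of two 16-byte vectors followed by an
/// add of the base address by 32 (or by a register increment) can be merged
/// into a single post-indexed ST2post that also yields the updated address.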
22464 SelectionDAG &DAG) {
22465 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
22466 return SDValue();
22467
22468 unsigned AddrOpIdx = N->getNumOperands() - 1;
22469 SDValue Addr = N->getOperand(AddrOpIdx);
22470
22471 // Search for a use of the address operand that is an increment.
22472 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
22473 UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
22474 SDNode *User = *UI;
22475 if (User->getOpcode() != ISD::ADD ||
22476 UI.getUse().getResNo() != Addr.getResNo())
22477 continue;
22478
22479 // Check that the add is independent of the load/store. Otherwise, folding
22480 // it would create a cycle.
22483 Visited.insert(Addr.getNode());
22484 Worklist.push_back(N);
22485 Worklist.push_back(User);
22486 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
22487 SDNode::hasPredecessorHelper(User, Visited, Worklist))
22488 continue;
22489
22490 // Find the new opcode for the updating load/store.
22491 bool IsStore = false;
22492 bool IsLaneOp = false;
22493 bool IsDupOp = false;
22494 unsigned NewOpc = 0;
22495 unsigned NumVecs = 0;
22496 unsigned IntNo = N->getConstantOperandVal(1);
22497 switch (IntNo) {
22498 default: llvm_unreachable("unexpected intrinsic for Neon base update");
22499 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
22500 NumVecs = 2; break;
22501 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
22502 NumVecs = 3; break;
22503 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
22504 NumVecs = 4; break;
22505 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
22506 NumVecs = 2; IsStore = true; break;
22507 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
22508 NumVecs = 3; IsStore = true; break;
22509 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
22510 NumVecs = 4; IsStore = true; break;
22511 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
22512 NumVecs = 2; break;
22513 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
22514 NumVecs = 3; break;
22515 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
22516 NumVecs = 4; break;
22517 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
22518 NumVecs = 2; IsStore = true; break;
22519 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
22520 NumVecs = 3; IsStore = true; break;
22521 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
22522 NumVecs = 4; IsStore = true; break;
22523 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
22524 NumVecs = 2; IsDupOp = true; break;
22525 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
22526 NumVecs = 3; IsDupOp = true; break;
22527 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
22528 NumVecs = 4; IsDupOp = true; break;
22529 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
22530 NumVecs = 2; IsLaneOp = true; break;
22531 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
22532 NumVecs = 3; IsLaneOp = true; break;
22533 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
22534 NumVecs = 4; IsLaneOp = true; break;
22535 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
22536 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
22537 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
22538 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
22539 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
22540 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
22541 }
22542
22543 EVT VecTy;
22544 if (IsStore)
22545 VecTy = N->getOperand(2).getValueType();
22546 else
22547 VecTy = N->getValueType(0);
22548
22549 // If the increment is a constant, it must match the memory ref size.
22550 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
22551 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
22552 uint32_t IncVal = CInc->getZExtValue();
22553 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
22554 if (IsLaneOp || IsDupOp)
22555 NumBytes /= VecTy.getVectorNumElements();
22556 if (IncVal != NumBytes)
22557 continue;
22558 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
22559 }
22561 Ops.push_back(N->getOperand(0)); // Incoming chain
22562 // Load lane and store have vector list as input.
22563 if (IsLaneOp || IsStore)
22564 for (unsigned i = 2; i < AddrOpIdx; ++i)
22565 Ops.push_back(N->getOperand(i));
22566 Ops.push_back(Addr); // Base register
22567 Ops.push_back(Inc);
22568
22569 // Return Types.
22570 EVT Tys[6];
22571 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
22572 unsigned n;
22573 for (n = 0; n < NumResultVecs; ++n)
22574 Tys[n] = VecTy;
22575 Tys[n++] = MVT::i64; // Type of write back register
22576 Tys[n] = MVT::Other; // Type of the chain
22577 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
22578
22579 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
22580 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
22581 MemInt->getMemoryVT(),
22582 MemInt->getMemOperand());
22583
22584 // Update the uses.
22585 std::vector<SDValue> NewResults;
22586 for (unsigned i = 0; i < NumResultVecs; ++i) {
22587 NewResults.push_back(SDValue(UpdN.getNode(), i));
22588 }
22589 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
22590 DCI.CombineTo(N, NewResults);
22591 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
22592
22593 break;
22594 }
22595 return SDValue();
22596}
22597
22598// Checks to see if the value is the prescribed width and returns information
22599// about its extension mode.
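// For example (illustrative): for width == 8 this accepts an i8 load (noting
// its extension type), an AssertSext/AssertZext to i8, or a constant that,
// roughly, fits in a signed 8-bit range.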
22600static
22601bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
22602 ExtType = ISD::NON_EXTLOAD;
22603 switch(V.getNode()->getOpcode()) {
22604 default:
22605 return false;
22606 case ISD::LOAD: {
22607 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
22608 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
22609 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
22610 ExtType = LoadNode->getExtensionType();
22611 return true;
22612 }
22613 return false;
22614 }
22615 case ISD::AssertSext: {
22616 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
22617 if ((TypeNode->getVT() == MVT::i8 && width == 8)
22618 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
22619 ExtType = ISD::SEXTLOAD;
22620 return true;
22621 }
22622 return false;
22623 }
22624 case ISD::AssertZext: {
22625 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
22626 if ((TypeNode->getVT() == MVT::i8 && width == 8)
22627 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
22628 ExtType = ISD::ZEXTLOAD;
22629 return true;
22630 }
22631 return false;
22632 }
22633 case ISD::Constant:
22634 case ISD::TargetConstant: {
22635 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
22636 1LL << (width - 1);
22637 }
22638 }
22639
22640 return true;
22641}
22642
22643// This function does a whole lot of voodoo to determine if the tests are
22644// equivalent without and with a mask. Essentially what happens is that given a
22645// DAG resembling:
22646//
22647// +-------------+ +-------------+ +-------------+ +-------------+
22648// | Input | | AddConstant | | CompConstant| | CC |
22649// +-------------+ +-------------+ +-------------+ +-------------+
22650// | | | |
22651// V V | +----------+
22652// +-------------+ +----+ | |
22653// | ADD | |0xff| | |
22654// +-------------+ +----+ | |
22655// | | | |
22656// V V | |
22657// +-------------+ | |
22658// | AND | | |
22659// +-------------+ | |
22660// | | |
22661// +-----+ | |
22662// | | |
22663// V V V
22664// +-------------+
22665// | CMP |
22666// +-------------+
22667//
22668// The AND node may be safely removed for some combinations of inputs. In
22669// particular we need to take into account the extension type of the Input,
22670// the exact values of AddConstant, CompConstant, and CC, along with the nominal
22671// width of the input (this can work for any width inputs, the above graph is
22672 // specific to 8 bits).
22673//
22674// The specific equations were worked out by generating output tables for each
22675 // AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
22676// problem was simplified by working with 4 bit inputs, which means we only
22677// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
22678// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
22679// patterns present in both extensions (0,7). For every distinct set of
22680// AddConstant and CompConstants bit patterns we can consider the masked and
22681// unmasked versions to be equivalent if the result of this function is true for
22682 // all 16 distinct bit patterns for the current extension type of Input (w0).
22683//
22684// sub w8, w0, w1
22685// and w10, w8, #0x0f
22686// cmp w8, w2
22687// cset w9, AArch64CC
22688// cmp w10, w2
22689// cset w11, AArch64CC
22690// cmp w9, w11
22691// cset w0, eq
22692// ret
22693//
22694// Since the above function shows when the outputs are equivalent it defines
22695// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
22696// would be expensive to run during compiles. The equations below were written
22697 // in a test harness that confirmed they gave outputs equivalent to the above
22698 // function for all inputs, so they can be used to determine if the removal is
22699 // legal instead.
22700//
22701// isEquivalentMaskless() is the code for testing if the AND can be removed
22702// factored out of the DAG recognition as the DAG can take several forms.
22703
22704static bool isEquivalentMaskless(unsigned CC, unsigned width,
22705 ISD::LoadExtType ExtType, int AddConstant,
22706 int CompConstant) {
22707 // By being careful about our equations and only writing them in terms of
22708 // symbolic values and well-known constants (0, 1, -1, MaxUInt) we can
22709 // make them generally applicable to all bit widths.
22710 int MaxUInt = (1 << width);
22711
22712 // For the purposes of these comparisons sign extending the type is
22713 // equivalent to zero extending the add and displacing it by half the integer
22714 // width. Provided we are careful and make sure our equations are valid over
22715 // the whole range we can just adjust the input and avoid writing equations
22716 // for sign extended inputs.
22717 if (ExtType == ISD::SEXTLOAD)
22718 AddConstant -= (1 << (width-1));
22719
22720 switch(CC) {
22721 case AArch64CC::LE:
22722 case AArch64CC::GT:
22723 if ((AddConstant == 0) ||
22724 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
22725 (AddConstant >= 0 && CompConstant < 0) ||
22726 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
22727 return true;
22728 break;
22729 case AArch64CC::LT:
22730 case AArch64CC::GE:
22731 if ((AddConstant == 0) ||
22732 (AddConstant >= 0 && CompConstant <= 0) ||
22733 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
22734 return true;
22735 break;
22736 case AArch64CC::HI:
22737 case AArch64CC::LS:
22738 if ((AddConstant >= 0 && CompConstant < 0) ||
22739 (AddConstant <= 0 && CompConstant >= -1 &&
22740 CompConstant < AddConstant + MaxUInt))
22741 return true;
22742 break;
22743 case AArch64CC::PL:
22744 case AArch64CC::MI:
22745 if ((AddConstant == 0) ||
22746 (AddConstant > 0 && CompConstant <= 0) ||
22747 (AddConstant < 0 && CompConstant <= AddConstant))
22748 return true;
22749 break;
22750 case AArch64CC::LO:
22751 case AArch64CC::HS:
22752 if ((AddConstant >= 0 && CompConstant <= 0) ||
22753 (AddConstant <= 0 && CompConstant >= 0 &&
22754 CompConstant <= AddConstant + MaxUInt))
22755 return true;
22756 break;
22757 case AArch64CC::EQ:
22758 case AArch64CC::NE:
22759 if ((AddConstant > 0 && CompConstant < 0) ||
22760 (AddConstant < 0 && CompConstant >= 0 &&
22761 CompConstant < AddConstant + MaxUInt) ||
22762 (AddConstant >= 0 && CompConstant >= 0 &&
22763 CompConstant >= AddConstant) ||
22764 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
22765 return true;
22766 break;
22767 case AArch64CC::VS:
22768 case AArch64CC::VC:
22769 case AArch64CC::AL:
22770 case AArch64CC::NV:
22771 return true;
22772 case AArch64CC::Invalid:
22773 break;
22774 }
22775
22776 return false;
22777}
22778
22779 // (X & C) >u Mask --> ((X & (C & ~Mask)) != 0)
22780 // (X & C) <u Pow2 --> ((X & (C & ~(Pow2 - 1))) == 0)
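// For example (illustrative): with C = 0xff, (X & 0xff) >u 0x0f becomes
// (X & 0xf0) != 0, and (X & 0xff) <u 0x10 becomes (X & 0xf0) == 0.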
22781 static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
22782 SDNode *AndNode, SelectionDAG &DAG,
22783 unsigned CCIndex, unsigned CmpIndex,
22784 unsigned CC) {
22785 ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(SubsNode->getOperand(1));
22786 if (!SubsC)
22787 return SDValue();
22788
22789 APInt SubsAP = SubsC->getAPIntValue();
22790 if (CC == AArch64CC::HI) {
22791 if (!SubsAP.isMask())
22792 return SDValue();
22793 } else if (CC == AArch64CC::LO) {
22794 if (!SubsAP.isPowerOf2())
22795 return SDValue();
22796 } else
22797 return SDValue();
22798
22799 ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1));
22800 if (!AndC)
22801 return SDValue();
22802
22803 APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
22804
22805 SDLoc DL(N);
22806 APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
22807 SDValue ANDS = DAG.getNode(
22808 AArch64ISD::ANDS, DL, SubsNode->getVTList(), AndNode->getOperand(0),
22809 DAG.getConstant(AndSMask, DL, SubsC->getValueType(0)));
22810 SDValue AArch64_CC =
22811 DAG.getConstant(CC, DL,
22812 N->getOperand(CCIndex)->getValueType(0));
22813
22814 // For now, only performCSELCombine and performBRCONDCombine call this
22815 // function, and both of them pass 2 for CCIndex and 3 for CmpIndex, with 4
22816 // operands. So just initialize the operands directly to keep the code simple;
22817 // if some other caller ever uses a different CCIndex or CmpIndex, this will
22818 // need to be rewritten with a loop.
22819 // TODO: Do we need to assert that the number of operands is 4 here?
22820 assert((CCIndex == 2 && CmpIndex == 3) &&
22821 "Expected CCIndex to be 2 and CmpIndex to be 3.");
22822 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), AArch64_CC,
22823 ANDS.getValue(1)};
22824 return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops);
22825}
22826
22827static
22828SDValue performCONDCombine(SDNode *N,
22829 TargetLowering::DAGCombinerInfo &DCI,
22830 SelectionDAG &DAG, unsigned CCIndex,
22831 unsigned CmpIndex) {
22832 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
22833 SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
22834 unsigned CondOpcode = SubsNode->getOpcode();
22835
22836 if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0) ||
22837 !SubsNode->hasOneUse())
22838 return SDValue();
22839
22840 // There is a SUBS feeding this condition. Is it fed by a mask we can
22841 // use?
22842
22843 SDNode *AndNode = SubsNode->getOperand(0).getNode();
22844 unsigned MaskBits = 0;
22845
22846 if (AndNode->getOpcode() != ISD::AND)
22847 return SDValue();
22848
22849 if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
22850 CmpIndex, CC))
22851 return Val;
22852
22853 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
22854 uint32_t CNV = CN->getZExtValue();
22855 if (CNV == 255)
22856 MaskBits = 8;
22857 else if (CNV == 65535)
22858 MaskBits = 16;
22859 }
22860
22861 if (!MaskBits)
22862 return SDValue();
22863
22864 SDValue AddValue = AndNode->getOperand(0);
22865
22866 if (AddValue.getOpcode() != ISD::ADD)
22867 return SDValue();
22868
22869 // The basic dag structure is correct, grab the inputs and validate them.
22870
22871 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
22872 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
22873 SDValue SubsInputValue = SubsNode->getOperand(1);
22874
22875 // The mask is present and the provenance of all the values is a smaller type,
22876 // so let's see if the mask is superfluous.
22877
22878 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
22879 !isa<ConstantSDNode>(SubsInputValue.getNode()))
22880 return SDValue();
22881
22882 ISD::LoadExtType ExtType;
22883
22884 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
22885 !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
22886 !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
22887 return SDValue();
22888
22889 if(!isEquivalentMaskless(CC, MaskBits, ExtType,
22890 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
22891 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
22892 return SDValue();
22893
22894 // The AND is not necessary, remove it.
22895
22896 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
22897 SubsNode->getValueType(1));
22898 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
22899
22900 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
22901 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
22902
22903 return SDValue(N, 0);
22904}
22905
22906// Optimize compare with zero and branch.
22907static SDValue performBRCONDCombine(SDNode *N,
22908 TargetLowering::DAGCombinerInfo &DCI,
22909 SelectionDAG &DAG) {
22910 MachineFunction &MF = DAG.getMachineFunction();
22911 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
22912 // will not be produced, as they are conditional branch instructions that do
22913 // not set flags.
22914 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
22915 return SDValue();
22916
22917 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
22918 N = NV.getNode();
22919 SDValue Chain = N->getOperand(0);
22920 SDValue Dest = N->getOperand(1);
22921 SDValue CCVal = N->getOperand(2);
22922 SDValue Cmp = N->getOperand(3);
22923
22924 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
22925 unsigned CC = CCVal->getAsZExtVal();
22926 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
22927 return SDValue();
22928
22929 unsigned CmpOpc = Cmp.getOpcode();
22930 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
22931 return SDValue();
22932
22933 // Only attempt folding if there is only one use of the flag and no use of the
22934 // value.
22935 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
22936 return SDValue();
22937
22938 SDValue LHS = Cmp.getOperand(0);
22939 SDValue RHS = Cmp.getOperand(1);
22940
22941 assert(LHS.getValueType() == RHS.getValueType() &&
22942 "Expected the value type to be the same for both operands!");
22943 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
22944 return SDValue();
22945
22946 if (isNullConstant(LHS))
22947 std::swap(LHS, RHS);
22948
22949 if (!isNullConstant(RHS))
22950 return SDValue();
22951
22952 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
22953 LHS.getOpcode() == ISD::SRL)
22954 return SDValue();
22955
22956 // Fold the compare into the branch instruction.
22957 SDValue BR;
22958 if (CC == AArch64CC::EQ)
22959 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
22960 else
22961 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
22962
22963 // Do not add new nodes to DAG combiner worklist.
22964 DCI.CombineTo(N, BR, false);
22965
22966 return SDValue();
22967}
22968
22969static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
22970 unsigned CC = N->getConstantOperandVal(2);
22971 SDValue SUBS = N->getOperand(3);
22972 SDValue Zero, CTTZ;
22973
22974 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
22975 Zero = N->getOperand(0);
22976 CTTZ = N->getOperand(1);
22977 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
22978 Zero = N->getOperand(1);
22979 CTTZ = N->getOperand(0);
22980 } else
22981 return SDValue();
22982
22983 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
22984 (CTTZ.getOpcode() == ISD::TRUNCATE &&
22985 CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
22986 return SDValue();
22987
22988 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
22989 "Illegal type in CTTZ folding");
22990
22991 if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
22992 return SDValue();
22993
22994 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
22995 ? CTTZ.getOperand(0).getOperand(0)
22996 : CTTZ.getOperand(0);
22997
22998 if (X != SUBS.getOperand(0))
22999 return SDValue();
23000
23001 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
23002 ? CTTZ.getOperand(0).getValueSizeInBits()
23003 : CTTZ.getValueSizeInBits();
23004 SDValue BitWidthMinusOne =
23005 DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
23006 return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
23007 BitWidthMinusOne);
23008}
23009
23010// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
23011// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
23012// Where x and y are constants and x != y
23013
23014// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
23015// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
23016// Where x and y are constants and x != y
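// For example (illustrative): with x == 1, y == 0 and cc2 == LT, the inner CSEL
// is 1 exactly when LT holds, so (CSEL l r EQ (CMP (CSEL 1 0 LT cond) 1))
// selects l exactly when LT holds and folds to (CSEL l r LT cond).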
23017static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
23018 SDValue L = Op->getOperand(0);
23019 SDValue R = Op->getOperand(1);
23020 AArch64CC::CondCode OpCC =
23021 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
23022
23023 SDValue OpCmp = Op->getOperand(3);
23024 if (!isCMP(OpCmp))
23025 return SDValue();
23026
23027 SDValue CmpLHS = OpCmp.getOperand(0);
23028 SDValue CmpRHS = OpCmp.getOperand(1);
23029
23030 if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
23031 std::swap(CmpLHS, CmpRHS);
23032 else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
23033 return SDValue();
23034
23035 SDValue X = CmpLHS->getOperand(0);
23036 SDValue Y = CmpLHS->getOperand(1);
23037 if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y) || X == Y) {
23038 return SDValue();
23039 }
23040
23041 // If one of the constants is an opaque constant, the x and y SDNodes can still
23042 // be different even though the real values are the same. So check the APInts
23043 // here to make sure the code is correct.
23044 ConstantSDNode *CX = cast<ConstantSDNode>(X);
23045 ConstantSDNode *CY = cast<ConstantSDNode>(Y);
23046 if (CX->getAPIntValue() == CY->getAPIntValue())
23047 return SDValue();
23048
23049 AArch64CC::CondCode CC =
23050 static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
23051 SDValue Cond = CmpLHS->getOperand(3);
23052
23053 if (CmpRHS == Y)
23054 CC = AArch64CC::getInvertedCondCode(CC);
23055 else if (CmpRHS != X)
23056 return SDValue();
23057
23058 if (OpCC == AArch64CC::NE)
23059 CC = AArch64CC::getInvertedCondCode(CC);
23060 else if (OpCC != AArch64CC::EQ)
23061 return SDValue();
23062
23063 SDLoc DL(Op);
23064 EVT VT = Op->getValueType(0);
23065
23066 SDValue CCValue = DAG.getConstant(CC, DL, MVT::i32);
23067 return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
23068}
23069
23070// Optimize CSEL instructions
23071static SDValue performCSELCombine(SDNode *N,
23072 TargetLowering::DAGCombinerInfo &DCI,
23073 SelectionDAG &DAG) {
23074 // CSEL x, x, cc -> x
23075 if (N->getOperand(0) == N->getOperand(1))
23076 return N->getOperand(0);
23077
23078 if (SDValue R = foldCSELOfCSEL(N, DAG))
23079 return R;
23080
23081 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
23082 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
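 // For example (illustrative): for i32, cttz(0) lowers to 32 and (32 & 31) == 0,
 // so the AND reproduces the 0 the CSEL would have selected for X == 0, while for
 // X != 0 the result is at most 31 and the AND is a no-op.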
23083 if (SDValue Folded = foldCSELofCTTZ(N, DAG))
23084 return Folded;
23085
23086 return performCONDCombine(N, DCI, DAG, 2, 3);
23087}
23088
23089// Try to re-use an already extended operand of a vector SetCC feeding an
23090// extended select. Doing so avoids requiring another full extension of the
23091// SET_CC result when lowering the select.
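// For example (illustrative): if every use of a v8i16 SetCC is a v8i32 VSELECT
// and a sign_extend of the first SetCC operand to v8i32 already exists, the
// SetCC is rebuilt on the extended operands so its result needs no further
// extension when the selects are lowered.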
23092static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
23093 EVT Op0MVT = Op->getOperand(0).getValueType();
23094 if (!Op0MVT.isVector() || Op->use_empty())
23095 return SDValue();
23096
23097 // Make sure that all uses of Op are VSELECTs with result matching types where
23098 // the result type has a larger element type than the SetCC operand.
23099 SDNode *FirstUse = *Op->use_begin();
23100 if (FirstUse->getOpcode() != ISD::VSELECT)
23101 return SDValue();
23102 EVT UseMVT = FirstUse->getValueType(0);
23103 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
23104 return SDValue();
23105 if (any_of(Op->uses(), [&UseMVT](const SDNode *N) {
23106 return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
23107 }))
23108 return SDValue();
23109
23110 APInt V;
23111 if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
23112 return SDValue();
23113
23114 SDLoc DL(Op);
23115 SDValue Op0ExtV;
23116 SDValue Op1ExtV;
23117 ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
23118 // Check if the first operand of the SET_CC is already extended. If it is,
23119 // split the SET_CC and re-use the extended version of the operand.
23120 SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
23121 Op->getOperand(0));
23122 SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
23123 Op->getOperand(0));
23124 if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
23125 Op0ExtV = SDValue(Op0SExt, 0);
23126 Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
23127 } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
23128 Op0ExtV = SDValue(Op0ZExt, 0);
23129 Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
23130 } else
23131 return SDValue();
23132
23133 return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
23134 Op0ExtV, Op1ExtV, Op->getOperand(2));
23135}
23136
23137static SDValue
23138performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
23139 SelectionDAG &DAG) {
23140 SDValue Vec = N->getOperand(0);
23141 if (DCI.isBeforeLegalize() &&
23142 Vec.getValueType().getVectorElementType() == MVT::i1 &&
23143 Vec.getValueType().isFixedLengthVector() &&
23144 Vec.getValueType().isPow2VectorType()) {
23145 SDLoc DL(N);
23146 return getVectorBitwiseReduce(N->getOpcode(), Vec, N->getValueType(0), DL,
23147 DAG);
23148 }
23149
23150 return SDValue();
23151}
23152
23153static SDValue performSETCCCombine(SDNode *N,
23154 TargetLowering::DAGCombinerInfo &DCI,
23155 SelectionDAG &DAG) {
23156 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
23157 SDValue LHS = N->getOperand(0);
23158 SDValue RHS = N->getOperand(1);
23159 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
23160 SDLoc DL(N);
23161 EVT VT = N->getValueType(0);
23162
23163 if (SDValue V = tryToWidenSetCCOperands(N, DAG))
23164 return V;
23165
23166 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
23167 if (Cond == ISD::SETNE && isOneConstant(RHS) &&
23168 LHS->getOpcode() == AArch64ISD::CSEL &&
23169 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
23170 LHS->hasOneUse()) {
23171 // Invert CSEL's condition.
23172 auto OldCond =
23173 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
23174 auto NewCond = getInvertedCondCode(OldCond);
23175
23176 // csel 0, 1, !cond, X
23177 SDValue CSEL =
23178 DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
23179 LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
23180 LHS.getOperand(3));
23181 return DAG.getZExtOrTrunc(CSEL, DL, VT);
23182 }
23183
23184 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
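 // For example (illustrative): for i32 with imm == 4, (x >> 4) != 0 becomes
 // (x & 0xFFFFFFF0) != 0, which can be matched as a single TST.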
23185 if (Cond == ISD::SETNE && isNullConstant(RHS) &&
23186 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
23187 LHS->getConstantOperandVal(1) < VT.getScalarSizeInBits() &&
23188 LHS->hasOneUse()) {
23189 EVT TstVT = LHS->getValueType(0);
23190 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) {
23191 // this pattern will get better opt in emitComparison
23192 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
23193 SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
23194 DAG.getConstant(TstImm, DL, TstVT));
23195 return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
23196 }
23197 }
23198
23199 // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
23200 // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
23201 // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne)
23202 // ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne)
23203 if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
23204 (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
23205 (isNullConstant(RHS) || isAllOnesConstant(RHS)) &&
23206 LHS->getOpcode() == ISD::BITCAST) {
23207 EVT ToVT = LHS->getValueType(0);
23208 EVT FromVT = LHS->getOperand(0).getValueType();
23209 if (FromVT.isFixedLengthVector() &&
23210 FromVT.getVectorElementType() == MVT::i1) {
23211 bool IsNull = isNullConstant(RHS);
23212 LHS = DAG.getNode(IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND,
23213 DL, MVT::i1, LHS->getOperand(0));
23214 LHS = DAG.getNode(IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, ToVT,
23215 LHS);
23216 return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
23217 }
23218 }
23219
23220 // Try to perform the memcmp when the result is tested for [in]equality with 0
23221 if (SDValue V = performOrXorChainCombine(N, DAG))
23222 return V;
23223
23224 return SDValue();
23225}
23226
23227// Replace a flag-setting operator (eg ANDS) with the generic version
23228// (eg AND) if the flag is unused.
23229static SDValue performFlagSettingCombine(SDNode *N,
23230 TargetLowering::DAGCombinerInfo &DCI,
23231 unsigned GenericOpcode) {
23232 SDLoc DL(N);
23233 SDValue LHS = N->getOperand(0);
23234 SDValue RHS = N->getOperand(1);
23235 EVT VT = N->getValueType(0);
23236
23237 // If the flag result isn't used, convert back to a generic opcode.
23238 if (!N->hasAnyUseOfValue(1)) {
23239 SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
23240 return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)},
23241 DL);
23242 }
23243
23244 // Combine identical generic nodes into this node, re-using the result.
23245 if (SDNode *Generic = DCI.DAG.getNodeIfExists(
23246 GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
23247 DCI.CombineTo(Generic, SDValue(N, 0));
23248
23249 return SDValue();
23250}
23251
23252static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
23253 // setcc_merge_zero pred
23254 // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
23255 // => extract_subvector (inner setcc_merge_zero)
23256 SDValue Pred = N->getOperand(0);
23257 SDValue LHS = N->getOperand(1);
23258 SDValue RHS = N->getOperand(2);
23259 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
23260
23261 if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
23262 LHS->getOpcode() != ISD::SIGN_EXTEND)
23263 return SDValue();
23264
23265 SDValue Extract = LHS->getOperand(0);
23266 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
23267 Extract->getValueType(0) != N->getValueType(0) ||
23268 Extract->getConstantOperandVal(1) != 0)
23269 return SDValue();
23270
23271 SDValue InnerSetCC = Extract->getOperand(0);
23272 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
23273 return SDValue();
23274
23275 // By this point we've effectively got
23276 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
23277 // lanes are already zero then the trunc(sext()) sequence is redundant and we
23278 // can operate on A directly.
23279 SDValue InnerPred = InnerSetCC.getOperand(0);
23280 if (Pred.getOpcode() == AArch64ISD::PTRUE &&
23281 InnerPred.getOpcode() == AArch64ISD::PTRUE &&
23282 Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
23283 Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
23284 Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
23285 return Extract;
23286
23287 return SDValue();
23288}
23289
23290static SDValue
23291performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
23292 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
23293 "Unexpected opcode!");
23294
23295 SelectionDAG &DAG = DCI.DAG;
23296 SDValue Pred = N->getOperand(0);
23297 SDValue LHS = N->getOperand(1);
23298 SDValue RHS = N->getOperand(2);
23299 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
23300
23301 if (SDValue V = performSetCCPunpkCombine(N, DAG))
23302 return V;
23303
23304 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
23305 LHS->getOpcode() == ISD::SIGN_EXTEND &&
23306 LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
23307 // setcc_merge_zero(
23308 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
23309 // => setcc_merge_zero(pred, ...)
23310 if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
23311 LHS->getOperand(0)->getOperand(0) == Pred)
23312 return LHS->getOperand(0);
23313
23314 // setcc_merge_zero(
23315 // all_active, extend(nxvNi1 ...), != splat(0))
23316 // -> nxvNi1 ...
23317 if (isAllActivePredicate(DAG, Pred))
23318 return LHS->getOperand(0);
23319
23320 // setcc_merge_zero(
23321 // pred, extend(nxvNi1 ...), != splat(0))
23322 // -> nxvNi1 and(pred, ...)
23323 if (DCI.isAfterLegalizeDAG())
23324 // Do this after legalization to allow more folds on setcc_merge_zero
23325 // to be recognized.
23326 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
23327 LHS->getOperand(0), Pred);
23328 }
23329
23330 return SDValue();
23331}
23332
23333// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
23334// as well as whether the test should be inverted. This code is required to
23335// catch these cases (as opposed to standard dag combines) because
23336// AArch64ISD::TBZ is matched during legalization.
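// For example (illustrative): (tbz (and (srl x, 3), 1), 0) folds in two steps to
// (tbz x, 3): the AND is dropped because bit 0 of its mask is set, and the SRL
// moves the tested bit from position 0 to position 3.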
23337static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
23338 SelectionDAG &DAG) {
23339
23340 if (!Op->hasOneUse())
23341 return Op;
23342
23343 // We don't handle undef/constant-fold cases below, as they should have
23344 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
23345 // etc.)
23346
23347 // (tbz (trunc x), b) -> (tbz x, b)
23348 // This case is just here to enable more of the below cases to be caught.
23349 if (Op->getOpcode() == ISD::TRUNCATE &&
23350 Bit < Op->getValueType(0).getSizeInBits()) {
23351 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23352 }
23353
23354 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
23355 if (Op->getOpcode() == ISD::ANY_EXTEND &&
23356 Bit < Op->getOperand(0).getValueSizeInBits()) {
23357 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23358 }
23359
23360 if (Op->getNumOperands() != 2)
23361 return Op;
23362
23363 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
23364 if (!C)
23365 return Op;
23366
23367 switch (Op->getOpcode()) {
23368 default:
23369 return Op;
23370
23371 // (tbz (and x, m), b) -> (tbz x, b)
23372 case ISD::AND:
23373 if ((C->getZExtValue() >> Bit) & 1)
23374 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23375 return Op;
23376
23377 // (tbz (shl x, c), b) -> (tbz x, b-c)
23378 case ISD::SHL:
23379 if (C->getZExtValue() <= Bit &&
23380 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
23381 Bit = Bit - C->getZExtValue();
23382 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23383 }
23384 return Op;
23385
23386 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
23387 case ISD::SRA:
23388 Bit = Bit + C->getZExtValue();
23389 if (Bit >= Op->getValueType(0).getSizeInBits())
23390 Bit = Op->getValueType(0).getSizeInBits() - 1;
23391 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23392
23393 // (tbz (srl x, c), b) -> (tbz x, b+c)
23394 case ISD::SRL:
23395 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
23396 Bit = Bit + C->getZExtValue();
23397 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23398 }
23399 return Op;
23400
23401 // (tbz (xor x, -1), b) -> (tbnz x, b)
23402 case ISD::XOR:
23403 if ((C->getZExtValue() >> Bit) & 1)
23404 Invert = !Invert;
23405 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23406 }
23407}
23408
23409// Optimize test single bit zero/non-zero and branch.
23410static SDValue performTBZCombine(SDNode *N,
23411 TargetLowering::DAGCombinerInfo &DCI,
23412 SelectionDAG &DAG) {
23413 unsigned Bit = N->getConstantOperandVal(2);
23414 bool Invert = false;
23415 SDValue TestSrc = N->getOperand(1);
23416 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
23417
23418 if (TestSrc == NewTestSrc)
23419 return SDValue();
23420
23421 unsigned NewOpc = N->getOpcode();
23422 if (Invert) {
23423 if (NewOpc == AArch64ISD::TBZ)
23424 NewOpc = AArch64ISD::TBNZ;
23425 else {
23426 assert(NewOpc == AArch64ISD::TBNZ);
23427 NewOpc = AArch64ISD::TBZ;
23428 }
23429 }
23430
23431 SDLoc DL(N);
23432 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
23433 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
23434}
23435
23436// Swap vselect operands where it may allow a predicated operation to achieve
23437// the `sel`.
23438//
23439// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
23440// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
23441static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
23442 auto SelectA = N->getOperand(1);
23443 auto SelectB = N->getOperand(2);
23444 auto NTy = N->getValueType(0);
23445
23446 if (!NTy.isScalableVector())
23447 return SDValue();
23448 SDValue SetCC = N->getOperand(0);
23449 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
23450 return SDValue();
23451
23452 switch (SelectB.getOpcode()) {
23453 default:
23454 return SDValue();
23455 case ISD::FMUL:
23456 case ISD::FSUB:
23457 case ISD::FADD:
23458 break;
23459 }
23460 if (SelectA != SelectB.getOperand(0))
23461 return SDValue();
23462
23463 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
23464 ISD::CondCode InverseCC =
23465 ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
23466 auto InverseSetCC =
23467 DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
23468 SetCC.getOperand(1), InverseCC);
23469
23470 return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
23471 {InverseSetCC, SelectB, SelectA});
23472}
23473
23474// vselect (v1i1 setcc) ->
23475// vselect (v1iXX setcc) (XX is the size of the compared operand type)
23476// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
23477// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
23478// such VSELECT.
23479static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
23480 if (auto SwapResult = trySwapVSelectOperands(N, DAG))
23481 return SwapResult;
23482
23483 SDValue N0 = N->getOperand(0);
23484 EVT CCVT = N0.getValueType();
23485
23486 if (isAllActivePredicate(DAG, N0))
23487 return N->getOperand(1);
23488
23489 if (isAllInactivePredicate(N0))
23490 return N->getOperand(2);
23491
23492 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
23493 // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
23494 // supported types.
23495 SDValue SetCC = N->getOperand(0);
23496 if (SetCC.getOpcode() == ISD::SETCC &&
23497 SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
23498 SDValue CmpLHS = SetCC.getOperand(0);
23499 EVT VT = CmpLHS.getValueType();
23500 SDNode *CmpRHS = SetCC.getOperand(1).getNode();
23501 SDNode *SplatLHS = N->getOperand(1).getNode();
23502 SDNode *SplatRHS = N->getOperand(2).getNode();
23503 APInt SplatLHSVal;
23504 if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
23505 VT.isSimple() &&
23506 is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
23507 MVT::v2i32, MVT::v4i32, MVT::v2i64}),
23508 VT.getSimpleVT().SimpleTy) &&
23509 ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
23510 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
23511 ISD::isConstantSplatVectorAllOnes(SplatRHS)) {
23512 unsigned NumElts = VT.getVectorNumElements();
23514 NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
23515 VT.getScalarType()));
23516 SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
23517
23518 auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
23519 auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
23520 return Or;
23521 }
23522 }
23523
23524 EVT CmpVT = N0.getOperand(0).getValueType();
23525 if (N0.getOpcode() != ISD::SETCC ||
23526 CCVT.getVectorElementCount() != ElementCount::getFixed(1) ||
23527 CCVT.getVectorElementType() != MVT::i1 ||
23528 CmpVT.getVectorElementType().isFloatingPoint())
23529 return SDValue();
23530
23531 EVT ResVT = N->getValueType(0);
23532 // Only combine when the result type is of the same size as the compared
23533 // operands.
23534 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
23535 return SDValue();
23536
23537 SDValue IfTrue = N->getOperand(1);
23538 SDValue IfFalse = N->getOperand(2);
23539 SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
23540 N0.getOperand(0), N0.getOperand(1),
23541 cast<CondCodeSDNode>(N0.getOperand(2))->get());
23542 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
23543 IfTrue, IfFalse);
23544}
23545
23546/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
23547/// the compare-mask instructions rather than going via NZCV, even if LHS and
23548/// RHS are really scalar. This replaces any scalar setcc in the above pattern
23549/// with a vector one followed by a DUP shuffle on the result.
23550static SDValue performSelectCombine(SDNode *N,
23551 TargetLowering::DAGCombinerInfo &DCI) {
23552 SelectionDAG &DAG = DCI.DAG;
23553 SDValue N0 = N->getOperand(0);
23554 EVT ResVT = N->getValueType(0);
23555
23556 if (N0.getOpcode() != ISD::SETCC)
23557 return SDValue();
23558
23559 if (ResVT.isScalableVT())
23560 return SDValue();
23561
23562 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
23563 // scalar SetCCResultType. We also don't expect vectors, because we assume
23564 // that selects fed by vector SETCCs are canonicalized to VSELECT.
23565 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
23566 "Scalar-SETCC feeding SELECT has unexpected result type!");
23567
23568 // If NumMaskElts == 0, the comparison is larger than the select result. The
23569 // largest real NEON comparison is 64 bits per lane, which means the result is
23570 // at most 32 bits and an illegal vector. Just bail out for now.
23571 EVT SrcVT = N0.getOperand(0).getValueType();
23572
23573 // Don't try to do this optimization when the setcc itself has i1 operands.
23574 // There are no legal vectors of i1, so this would be pointless. v1f16 is
23575 // ruled out to prevent the creation of setcc that need to be scalarized.
23576 if (SrcVT == MVT::i1 ||
23577 (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
23578 return SDValue();
23579
23580 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
23581 if (!ResVT.isVector() || NumMaskElts == 0)
23582 return SDValue();
23583
23584 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
23585 EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
23586
23587 // Also bail out if the vector CCVT isn't the same size as ResVT.
23588 // This can happen if the SETCC operand size doesn't divide the ResVT size
23589 // (e.g., f64 vs v3f32).
23590 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
23591 return SDValue();
23592
23593 // Make sure we didn't create illegal types, if we're not supposed to.
23594 assert(DCI.isBeforeLegalize() ||
23595 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
23596
23597 // First perform a vector comparison, where lane 0 is the one we're interested
23598 // in.
23599 SDLoc DL(N0);
23600 SDValue LHS =
23601 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
23602 SDValue RHS =
23603 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
23604 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
23605
23606 // Now duplicate the comparison mask we want across all other lanes.
23607 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
23608 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
23609 Mask = DAG.getNode(ISD::BITCAST, DL,
23610 ResVT.changeVectorElementTypeToInteger(), Mask);
23611
23612 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
23613}
23614
23615static SDValue performDUPCombine(SDNode *N,
23616 TargetLowering::DAGCombinerInfo &DCI) {
23617 EVT VT = N->getValueType(0);
23618 SDLoc DL(N);
23619 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
23620 // 128bit vector version.
23621 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
23622 EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
23623 SmallVector<SDValue> Ops(N->ops());
23624 if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
23625 DCI.DAG.getVTList(LVT), Ops)) {
23626 return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
23627 DCI.DAG.getConstant(0, DL, MVT::i64));
23628 }
23629 }
23630
23631 if (N->getOpcode() == AArch64ISD::DUP) {
23632 if (DCI.isAfterLegalizeDAG()) {
23633 // If scalar dup's operand is extract_vector_elt, try to combine them into
23634 // duplane. For example,
23635 //
23636 // t21: i32 = extract_vector_elt t19, Constant:i64<0>
23637 // t18: v4i32 = AArch64ISD::DUP t21
23638 // ==>
23639 // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
23640 SDValue EXTRACT_VEC_ELT = N->getOperand(0);
23641 if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23642 if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) {
23643 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
23644 return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0),
23645 EXTRACT_VEC_ELT.getOperand(1));
23646 }
23647 }
23648 }
23649
23650 return performPostLD1Combine(N, DCI, false);
23651 }
23652
23653 return SDValue();
23654}
23655
23656/// Get rid of unnecessary NVCASTs (that don't change the type).
23657static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG) {
23658 if (N->getValueType(0) == N->getOperand(0).getValueType())
23659 return N->getOperand(0);
23660 if (N->getOperand(0).getOpcode() == AArch64ISD::NVCAST)
23661 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(N), N->getValueType(0),
23662 N->getOperand(0).getOperand(0));
23663
23664 return SDValue();
23665}
23666
23667// If all users of the globaladdr are of the form (globaladdr + constant), find
23668// the smallest constant, fold it into the globaladdr's offset and rewrite the
23669// globaladdr as (globaladdr + constant) - constant.
23670static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
23671 const AArch64Subtarget *Subtarget,
23672 const TargetMachine &TM) {
23673 auto *GN = cast<GlobalAddressSDNode>(N);
23674 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
23675 AArch64II::MO_NO_FLAG)
23676 return SDValue();
23677
23678 uint64_t MinOffset = -1ull;
23679 for (SDNode *N : GN->uses()) {
23680 if (N->getOpcode() != ISD::ADD)
23681 return SDValue();
23682 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
23683 if (!C)
23684 C = dyn_cast<ConstantSDNode>(N->getOperand(1));
23685 if (!C)
23686 return SDValue();
23687 MinOffset = std::min(MinOffset, C->getZExtValue());
23688 }
23689 uint64_t Offset = MinOffset + GN->getOffset();
23690
23691 // Require that the new offset is larger than the existing one. Otherwise, we
23692 // can end up oscillating between two possible DAGs, for example,
23693 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
23694 if (Offset <= uint64_t(GN->getOffset()))
23695 return SDValue();
23696
23697 // Check whether folding this offset is legal. It must not go out of bounds of
23698 // the referenced object to avoid violating the code model, and must be
23699 // smaller than 2^20 because this is the largest offset expressible in all
23700 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
23701 // stores an immediate signed 21 bit offset.)
23702 //
23703 // This check also prevents us from folding negative offsets, which will end
23704 // up being treated in the same way as large positive ones. They could also
23705 // cause code model violations, and aren't really common enough to matter.
23706 if (Offset >= (1 << 20))
23707 return SDValue();
23708
23709 const GlobalValue *GV = GN->getGlobal();
23710 Type *T = GV->getValueType();
23711 if (!T->isSized() ||
23713 return SDValue();
23714
23715 SDLoc DL(GN);
23716 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
23717 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
23718 DAG.getConstant(MinOffset, DL, MVT::i64));
23719}
23720
23721static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG,
23722 const AArch64Subtarget *Subtarget) {
23723 SDValue BR = N->getOperand(0);
23724 if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
23725 !BR.getValueType().isScalarInteger())
23726 return SDValue();
23727
23728 SDLoc DL(N);
23729 return DAG.getNode(ISD::CTTZ, DL, BR.getValueType(), BR.getOperand(0));
23730}
23731
23732// Turns the vector of indices into a vector of byte offsets by scaling Offset
23733// by (BitWidth / 8).
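// For example (illustrative): for 64-bit elements (BitWidth == 64) the index
// vector <0, 1, 2, 3> is shifted left by log2(8) == 3, giving the byte offsets
// <0, 8, 16, 24>.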
23734static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
23735 SDLoc DL, unsigned BitWidth) {
23736 assert(Offset.getValueType().isScalableVector() &&
23737 "This method is only for scalable vectors of offsets");
23738
23739 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
23740 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
23741
23742 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
23743}
23744
23745/// Check if the value of \p OffsetInBytes can be used as an immediate for
23746/// the gather load/prefetch and scatter store instructions with vector base and
23747/// immediate offset addressing mode:
23748///
23749/// [<Zn>.[S|D]{, #<imm>}]
23750///
23751/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
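/// For example (illustrative): for 32-bit elements (sizeof(<T>) == 4) the valid
/// immediates are 0, 4, 8, ..., 124; an offset of 6 (not a multiple of 4) or of
/// 128 (k == 32) is rejected.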
23752inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
23753 unsigned ScalarSizeInBytes) {
23754 // The immediate is not a multiple of the scalar size.
23755 if (OffsetInBytes % ScalarSizeInBytes)
23756 return false;
23757
23758 // The immediate is out of range.
23759 if (OffsetInBytes / ScalarSizeInBytes > 31)
23760 return false;
23761
23762 return true;
23763}
23764
23765/// Check if the value of \p Offset represents a valid immediate for the SVE
23766/// gather load/prefetch and scatter store instructions with vector base and
23767/// immediate offset addressing mode:
23768///
23769/// [<Zn>.[S|D]{, #<imm>}]
23770///
23771/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
23773 unsigned ScalarSizeInBytes) {
23774 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
23775 return OffsetConst && isValidImmForSVEVecImmAddrMode(
23776 OffsetConst->getZExtValue(), ScalarSizeInBytes);
23777}
23778
23780 unsigned Opcode,
23781 bool OnlyPackedOffsets = true) {
23782 const SDValue Src = N->getOperand(2);
23783 const EVT SrcVT = Src->getValueType(0);
23784 assert(SrcVT.isScalableVector() &&
23785 "Scatter stores are only possible for SVE vectors");
23786
23787 SDLoc DL(N);
23788 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
23789
23790 // Make sure that source data will fit into an SVE register
23791 if (SrcVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
23792 return SDValue();
23793
23794 // For FPs, ACLE only supports _packed_ single and double precision types.
23795 // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
23796 if (SrcElVT.isFloatingPoint())
23797 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
23798 ((Opcode != AArch64ISD::SST1Q_PRED &&
23799 Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
23800 ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
23801 return SDValue();
23802
23803 // Depending on the addressing mode, this is either a pointer or a vector of
23804 // pointers (that fits into one register)
23805 SDValue Base = N->getOperand(4);
23806 // Depending on the addressing mode, this is either a single offset or a
23807 // vector of offsets (that fits into one register)
23808 SDValue Offset = N->getOperand(5);
23809
23810 // For "scalar + vector of indices", just scale the indices. This only
23811 // applies to non-temporal scatters because there's no instruction that takes
23812 // indices.
23813 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
23814 Offset =
23816 Opcode = AArch64ISD::SSTNT1_PRED;
23817 } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
23818 Offset =
23820 Opcode = AArch64ISD::SST1Q_PRED;
23821 }
23822
23823 // In the case of non-temporal scatter stores there's only one SVE instruction
23824 // per data-size: "scalar + vector", i.e.
23825 // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
23826 // Since we do have intrinsics that allow the arguments to be in a different
23827 // order, we may need to swap them to match the spec.
23828 if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
23829 Offset.getValueType().isVector())
23830 std::swap(Base, Offset);
23831
23832 // SST1_IMM requires that the offset is an immediate that is:
23833 // * a multiple of #SizeInBytes,
23834 // * in the range [0, 31 x #SizeInBytes],
23835 // where #SizeInBytes is the size in bytes of the stored items. For
23836 // immediates outside that range and non-immediate scalar offsets use SST1 or
23837 // SST1_UXTW instead.
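 // For example (illustrative): a doubleword scatter (#SizeInBytes == 8) can
 // encode the immediates 0, 8, ..., 248; an immediate of 256, or a scalar
 // register offset, falls back to the SST1/SST1_UXTW forms handled below.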
23838 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
23839 if (!isValidImmForSVEVecImmAddrMode(Offset,
23840 SrcVT.getScalarSizeInBits() / 8)) {
23841 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
23842 Opcode = AArch64ISD::SST1_UXTW_PRED;
23843 else
23844 Opcode = AArch64ISD::SST1_PRED;
23845
23846 std::swap(Base, Offset);
23847 }
23848 }
23849
23850 auto &TLI = DAG.getTargetLoweringInfo();
23851 if (!TLI.isTypeLegal(Base.getValueType()))
23852 return SDValue();
23853
23854 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
23855 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
23856 // nxv2i64. Legalize accordingly.
23857 if (!OnlyPackedOffsets &&
23858 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
23859 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
23860
23861 if (!TLI.isTypeLegal(Offset.getValueType()))
23862 return SDValue();
23863
23864 // Source value type that is representable in hardware
23865 EVT HwSrcVt = getSVEContainerType(SrcVT);
23866
23867 // Keep the original type of the input data to store - this is needed to be
23868 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
23869 // FP values we want the integer equivalent, so just use HwSrcVt.
23870 SDValue InputVT = DAG.getValueType(SrcVT);
23871 if (SrcVT.isFloatingPoint())
23872 InputVT = DAG.getValueType(HwSrcVt);
23873
23874 SDVTList VTs = DAG.getVTList(MVT::Other);
23875 SDValue SrcNew;
23876
23877 if (Src.getValueType().isFloatingPoint())
23878 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
23879 else
23880 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
23881
23882 SDValue Ops[] = {N->getOperand(0), // Chain
23883 SrcNew,
23884 N->getOperand(3), // Pg
23885 Base,
23886 Offset,
23887 InputVT};
23888
23889 return DAG.getNode(Opcode, DL, VTs, Ops);
23890}
23891
23893 unsigned Opcode,
23894 bool OnlyPackedOffsets = true) {
23895 const EVT RetVT = N->getValueType(0);
23896 assert(RetVT.isScalableVector() &&
23897 "Gather loads are only possible for SVE vectors");
23898
23899 SDLoc DL(N);
23900
23901 // Make sure that the loaded data will fit into an SVE register
23902 if (RetVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
23903 return SDValue();
23904
23905 // Depending on the addressing mode, this is either a pointer or a vector of
23906 // pointers (that fits into one register)
23907 SDValue Base = N->getOperand(3);
23908 // Depending on the addressing mode, this is either a single offset or a
23909 // vector of offsets (that fits into one register)
23910 SDValue Offset = N->getOperand(4);
23911
23912 // For "scalar + vector of indices", scale the indices to obtain unscaled
23913 // offsets. This applies to non-temporal and quadword gathers, which do not
23914 // have an addressing mode with scaled offset.
23915 if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
23916 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
23917 RetVT.getScalarSizeInBits());
23918 Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
23919 } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
23920 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
23921 RetVT.getScalarSizeInBits());
23922 Opcode = AArch64ISD::GLD1Q_MERGE_ZERO;
23923 }
23924
23925 // In the case of non-temporal gather loads and quadword gather loads there's
23926 // only one addressing mode : "vector + scalar", e.g.
23927 // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
23928 // Since we do have intrinsics that allow the arguments to be in a different
23929 // order, we may need to swap them to match the spec.
23930 if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
23931 Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
23932 Offset.getValueType().isVector())
23933 std::swap(Base, Offset);
23934
23935 // GLD{FF}1_IMM requires that the offset is an immediate that is:
23936 // * a multiple of #SizeInBytes,
23937 // * in the range [0, 31 x #SizeInBytes],
23938 // where #SizeInBytes is the size in bytes of the loaded items. For
23939 // immediates outside that range and non-immediate scalar offsets use
23940 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
23941 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
23942 Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
23943 if (!isValidImmForSVEVecImmAddrMode(Offset,
23944 RetVT.getScalarSizeInBits() / 8)) {
23945 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
23946 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
23947 ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
23948 : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
23949 else
23950 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
23951 ? AArch64ISD::GLD1_MERGE_ZERO
23952 : AArch64ISD::GLDFF1_MERGE_ZERO;
23953
23954 std::swap(Base, Offset);
23955 }
23956 }
23957
23958 auto &TLI = DAG.getTargetLoweringInfo();
23959 if (!TLI.isTypeLegal(Base.getValueType()))
23960 return SDValue();
23961
23962 // Some gather load variants allow unpacked offsets, but only as nxv2i32
23963 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
23964 // nxv2i64. Legalize accordingly.
23965 if (!OnlyPackedOffsets &&
23966 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
23967 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
23968
23969 // Return value type that is representable in hardware
23970 EVT HwRetVt = getSVEContainerType(RetVT);
23971
23972 // Keep the original output value type around - this is needed to be able to
23973 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
23974 // values we want the integer equivalent, so just use HwRetVT.
23975 SDValue OutVT = DAG.getValueType(RetVT);
23976 if (RetVT.isFloatingPoint())
23977 OutVT = DAG.getValueType(HwRetVt);
23978
23979 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
23980 SDValue Ops[] = {N->getOperand(0), // Chain
23981 N->getOperand(2), // Pg
23982 Base, Offset, OutVT};
23983
23984 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
23985 SDValue LoadChain = SDValue(Load.getNode(), 1);
23986
23987 if (RetVT.isInteger() && (RetVT != HwRetVt))
23988 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
23989
23990 // If the original return value was FP, bitcast accordingly. Doing it here
23991 // means that we can avoid adding TableGen patterns for FPs.
23992 if (RetVT.isFloatingPoint())
23993 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
23994
23995 return DAG.getMergeValues({Load, LoadChain}, DL);
23996}
23997
23998static SDValue
23999performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
24000 SelectionDAG &DAG) {
24001 SDLoc DL(N);
24002 SDValue Src = N->getOperand(0);
24003 unsigned Opc = Src->getOpcode();
24004
24005 // Sign extend of an unsigned unpack -> signed unpack
24006 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
24007
24008 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
24009 : AArch64ISD::SUNPKLO;
24010
24011 // Push the sign extend to the operand of the unpack
24012 // This is necessary where, for example, the operand of the unpack
24013 // is another unpack:
24014 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
24015 // ->
24016 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
24017 // ->
24018 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
24019 SDValue ExtOp = Src->getOperand(0);
24020 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
24021 EVT EltTy = VT.getVectorElementType();
24022 (void)EltTy;
24023
24024 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
24025 "Sign extending from an invalid type");
24026
24027 EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
24028
24029 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
24030 ExtOp, DAG.getValueType(ExtVT));
24031
24032 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
24033 }
24034
24035 if (DCI.isBeforeLegalizeOps())
24036 return SDValue();
24037
24039 return SDValue();
24040
24041 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
24042 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
24043 unsigned NewOpc;
24044 unsigned MemVTOpNum = 4;
24045 switch (Opc) {
24048 MemVTOpNum = 3;
24049 break;
24052 MemVTOpNum = 3;
24053 break;
24056 MemVTOpNum = 3;
24057 break;
24060 break;
24063 break;
24066 break;
24069 break;
24072 break;
24075 break;
24078 break;
24081 break;
24084 break;
24087 break;
24090 break;
24093 break;
24096 break;
24099 break;
24102 break;
24103 default:
24104 return SDValue();
24105 }
24106
24107 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
24108 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
24109
24110 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
24111 return SDValue();
24112
24113 EVT DstVT = N->getValueType(0);
24114 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
24115
24117 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
24118 Ops.push_back(Src->getOperand(I));
24119
24120 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
24121 DCI.CombineTo(N, ExtLoad);
24122 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
24123
24124 // Return N so it doesn't get rechecked
24125 return SDValue(N, 0);
24126}
24127
24128/// Legalize the gather prefetch (scalar + vector addressing mode) when the
24129/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
24130/// != nxv2i32) do not need legalization.
24132 const unsigned OffsetPos = 4;
24133 SDValue Offset = N->getOperand(OffsetPos);
24134
24135 // Not an unpacked vector, bail out.
24136 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
24137 return SDValue();
24138
24139 // Extend the unpacked offset vector to 64-bit lanes.
24140 SDLoc DL(N);
24141 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
24142 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
24143 // Replace the offset operand with the 64-bit one.
24144 Ops[OffsetPos] = Offset;
24145
24146 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
24147}
24148
24149/// Combines a node carrying the intrinsic
24150/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
24151/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
24152/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
24153/// sve gather prefetch instruction with vector plus immediate addressing mode.
24155 unsigned ScalarSizeInBytes) {
24156 const unsigned ImmPos = 4, OffsetPos = 3;
24157 // No need to combine the node if the immediate is valid...
24158 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
24159 return SDValue();
24160
24161 // ...otherwise swap the offset base with the offset...
24162 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
24163 std::swap(Ops[ImmPos], Ops[OffsetPos]);
24164 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
24165 // `aarch64_sve_prfb_gather_uxtw_index`.
24166 SDLoc DL(N);
24167 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
24168 MVT::i64);
24169
24170 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
24171}
24172
24173// Return true if the vector operation can guarantee only the first lane of its
24174// result contains data, with all bits in other lanes set to zero.
24176 switch (Op.getOpcode()) {
24177 default:
24178 return false;
24194 return true;
24195 }
24196}
24197
24198static SDValue removeRedundantInsertVectorElt(SDNode *N) {
24199 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
24200 SDValue InsertVec = N->getOperand(0);
24201 SDValue InsertElt = N->getOperand(1);
24202 SDValue InsertIdx = N->getOperand(2);
24203
24204 // We only care about inserts into the first element...
24205 if (!isNullConstant(InsertIdx))
24206 return SDValue();
24207 // ...of a zero'd vector...
24209 return SDValue();
24210 // ...where the inserted data was previously extracted...
24211 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
24212 return SDValue();
24213
24214 SDValue ExtractVec = InsertElt.getOperand(0);
24215 SDValue ExtractIdx = InsertElt.getOperand(1);
24216
24217 // ...from the first element of a vector.
24218 if (!isNullConstant(ExtractIdx))
24219 return SDValue();
24220
24221 // If we get here we are effectively trying to zero lanes 1-N of a vector.
24222
24223 // Ensure there's no type conversion going on.
24224 if (N->getValueType(0) != ExtractVec.getValueType())
24225 return SDValue();
24226
24227 if (!isLanes1toNKnownZero(ExtractVec))
24228 return SDValue();
24229
24230 // The explicit zeroing is redundant.
24231 return ExtractVec;
24232}
24233
24234static SDValue
24237 return Res;
24238
24239 return performPostLD1Combine(N, DCI, true);
24240}
24241
24243 EVT Ty = N->getValueType(0);
24244 if (Ty.isInteger())
24245 return SDValue();
24246
24249 if (ExtIntTy.getVectorElementType().getScalarSizeInBits() <
24251 return SDValue();
24252
24253 SDLoc DL(N);
24254 SDValue LHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(0)),
24255 DL, ExtIntTy);
24256 SDValue RHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(1)),
24257 DL, ExtIntTy);
24258 SDValue Idx = N->getOperand(2);
24259 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ExtIntTy, LHS, RHS, Idx);
24260 SDValue Trunc = DAG.getAnyExtOrTrunc(Splice, DL, IntTy);
24261 return DAG.getBitcast(Ty, Trunc);
24262}
24263
24266 const AArch64Subtarget *Subtarget) {
24267 SDValue N0 = N->getOperand(0);
24268 EVT VT = N->getValueType(0);
24269
24270 // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
24271 if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND)
24272 return SDValue();
24273
24274 auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
24275 EVT EltVT = VT.getVectorElementType();
24276 return EltVT == MVT::f32 || EltVT == MVT::f64;
24277 };
24278
24279 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
24280 // We purposefully don't care about legality of the nodes here as we know
24281 // they can be split down into something legal.
24282 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
24283 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
24284 VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
24285 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
24286 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
24287 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
24288 LN0->getChain(), LN0->getBasePtr(),
24289 N0.getValueType(), LN0->getMemOperand());
24290 DCI.CombineTo(N, ExtLoad);
24291 DCI.CombineTo(
24292 N0.getNode(),
24293 DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
24294 DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
24295 ExtLoad.getValue(1));
24296 return SDValue(N, 0); // Return N so it doesn't get rechecked!
24297 }
24298
24299 return SDValue();
24300}
24301
24303 const AArch64Subtarget *Subtarget) {
24304 EVT VT = N->getValueType(0);
24305
24306 // Don't expand for NEON, SVE2 or SME
24307 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
24308 return SDValue();
24309
24310 SDLoc DL(N);
24311
24312 SDValue Mask = N->getOperand(0);
24313 SDValue In1 = N->getOperand(1);
24314 SDValue In2 = N->getOperand(2);
24315
24316 SDValue InvMask = DAG.getNOT(DL, Mask, VT);
24317 SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
24318 SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
24319 return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
24320}
24321
24323 EVT VT = N->getValueType(0);
24324
24325 SDValue Insert = N->getOperand(0);
24326 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
24327 return SDValue();
24328
24329 if (!Insert.getOperand(0).isUndef())
24330 return SDValue();
24331
24332 uint64_t IdxInsert = Insert.getConstantOperandVal(2);
24333 uint64_t IdxDupLane = N->getConstantOperandVal(1);
24334 if (IdxInsert != 0 || IdxDupLane != 0)
24335 return SDValue();
24336
24337 SDValue Bitcast = Insert.getOperand(1);
24338 if (Bitcast.getOpcode() != ISD::BITCAST)
24339 return SDValue();
24340
24341 SDValue Subvec = Bitcast.getOperand(0);
24342 EVT SubvecVT = Subvec.getValueType();
24343 if (!SubvecVT.is128BitVector())
24344 return SDValue();
24345 EVT NewSubvecVT =
24347
24348 SDLoc DL(N);
24349 SDValue NewInsert =
24350 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
24351 DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
24352 SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
24353 NewInsert, N->getOperand(1));
24354 return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
24355}
24356
24357// Try to combine mull with uzp1.
24360 SelectionDAG &DAG) {
24361 if (DCI.isBeforeLegalizeOps())
24362 return SDValue();
24363
24364 SDValue LHS = N->getOperand(0);
24365 SDValue RHS = N->getOperand(1);
24366
24367 SDValue ExtractHigh;
24368 SDValue ExtractLow;
24369 SDValue TruncHigh;
24370 SDValue TruncLow;
24371 SDLoc DL(N);
24372
24373 // Check the operands are trunc and extract_high.
24375 RHS.getOpcode() == ISD::TRUNCATE) {
24376 TruncHigh = RHS;
24377 if (LHS.getOpcode() == ISD::BITCAST)
24378 ExtractHigh = LHS.getOperand(0);
24379 else
24380 ExtractHigh = LHS;
24382 LHS.getOpcode() == ISD::TRUNCATE) {
24383 TruncHigh = LHS;
24384 if (LHS.getOpcode() == ISD::BITCAST)
24385 ExtractHigh = RHS.getOperand(0);
24386 else
24387 ExtractHigh = RHS;
24388 } else
24389 return SDValue();
24390
24391 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
24392 // with uzp1.
24393 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
24394 SDValue TruncHighOp = TruncHigh.getOperand(0);
24395 EVT TruncHighOpVT = TruncHighOp.getValueType();
24396 if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
24397 DAG.isSplatValue(TruncHighOp, false))
24398 return SDValue();
24399
24400 // Check whether there is another extract_high with the same source vector.
24401 // For example,
24402 //
24403 // t18: v4i16 = extract_subvector t2, Constant:i64<0>
24404 // t12: v4i16 = truncate t11
24405 // t31: v4i32 = AArch64ISD::SMULL t18, t12
24406 // t23: v4i16 = extract_subvector t2, Constant:i64<4>
24407 // t16: v4i16 = truncate t15
24408 // t30: v4i32 = AArch64ISD::SMULL t23, t1
24409 //
24410 // This DAG combine assumes the two extract_high nodes use the same source
24411 // vector in order to detect the pair of MULLs. If they use different source
24412 // vectors, this code will not work.
24413 bool HasFoundMULLow = true;
24414 SDValue ExtractHighSrcVec = ExtractHigh.getOperand(0);
24415 if (ExtractHighSrcVec->use_size() != 2)
24416 HasFoundMULLow = false;
24417
24418 // Find ExtractLow.
24419 for (SDNode *User : ExtractHighSrcVec.getNode()->uses()) {
24420 if (User == ExtractHigh.getNode())
24421 continue;
24422
24423 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
24425 HasFoundMULLow = false;
24426 break;
24427 }
24428
24429 ExtractLow.setNode(User);
24430 }
24431
24432 if (!ExtractLow || !ExtractLow->hasOneUse())
24433 HasFoundMULLow = false;
24434
24435 // Check ExtractLow's user.
24436 if (HasFoundMULLow) {
24437 SDNode *ExtractLowUser = *ExtractLow.getNode()->use_begin();
24438 if (ExtractLowUser->getOpcode() != N->getOpcode()) {
24439 HasFoundMULLow = false;
24440 } else {
24441 if (ExtractLowUser->getOperand(0) == ExtractLow) {
24442 if (ExtractLowUser->getOperand(1).getOpcode() == ISD::TRUNCATE)
24443 TruncLow = ExtractLowUser->getOperand(1);
24444 else
24445 HasFoundMULLow = false;
24446 } else {
24447 if (ExtractLowUser->getOperand(0).getOpcode() == ISD::TRUNCATE)
24448 TruncLow = ExtractLowUser->getOperand(0);
24449 else
24450 HasFoundMULLow = false;
24451 }
24452 }
24453 }
24454
24455 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
24456 // with uzp1.
24457 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
24458 EVT TruncHighVT = TruncHigh.getValueType();
24459 EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(*DAG.getContext());
24460 SDValue TruncLowOp =
24461 HasFoundMULLow ? TruncLow.getOperand(0) : DAG.getUNDEF(UZP1VT);
24462 EVT TruncLowOpVT = TruncLowOp.getValueType();
24463 if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
24464 DAG.isSplatValue(TruncLowOp, false)))
24465 return SDValue();
24466
24467 // Create uzp1, extract_high and extract_low.
24468 if (TruncHighOpVT != UZP1VT)
24469 TruncHighOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncHighOp);
24470 if (TruncLowOpVT != UZP1VT)
24471 TruncLowOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncLowOp);
24472
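// uzp1 keeps the even-indexed narrow elements of its two operands, which after
// the bitcasts above are exactly the truncated values of the low and high
// sources; the two truncates are re-extracted from its halves below.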
24473 SDValue UZP1 =
24474 DAG.getNode(AArch64ISD::UZP1, DL, UZP1VT, TruncLowOp, TruncHighOp);
24475 SDValue HighIdxCst =
24476 DAG.getConstant(TruncHighVT.getVectorNumElements(), DL, MVT::i64);
24477 SDValue NewTruncHigh =
24478 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncHighVT, UZP1, HighIdxCst);
24479 DAG.ReplaceAllUsesWith(TruncHigh, NewTruncHigh);
24480
24481 if (HasFoundMULLow) {
24482 EVT TruncLowVT = TruncLow.getValueType();
24483 SDValue NewTruncLow = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncLowVT,
24484 UZP1, ExtractLow.getOperand(1));
24485 DAG.ReplaceAllUsesWith(TruncLow, NewTruncLow);
24486 }
24487
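// The node itself is not replaced; returning SDValue(N, 0) simply signals to
// the combiner that the truncate operands were rewritten in place.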
24488 return SDValue(N, 0);
24489}
24490
24491static SDValue performMULLCombine(SDNode *N,
24492 TargetLowering::DAGCombinerInfo &DCI,
24493 SelectionDAG &DAG) {
24494 if (SDValue Val =
24495 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG))
24496 return Val;
24497
24498 if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
24499 return Val;
24500
24501 return SDValue();
24502}
24503
24504static SDValue
24505performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
24506 SelectionDAG &DAG) {
24507 // Let's do below transform.
24508 //
24509 // t34: v4i32 = AArch64ISD::UADDLV t2
24510 // t35: i32 = extract_vector_elt t34, Constant:i64<0>
24511 // t7: i64 = zero_extend t35
24512 // t20: v1i64 = scalar_to_vector t7
24513 // ==>
24514 // t34: v4i32 = AArch64ISD::UADDLV t2
24515 // t39: v2i32 = extract_subvector t34, Constant:i64<0>
24516 // t40: v1i64 = AArch64ISD::NVCAST t39
24517 if (DCI.isBeforeLegalizeOps())
24518 return SDValue();
24519
24520 EVT VT = N->getValueType(0);
24521 if (VT != MVT::v1i64)
24522 return SDValue();
24523
24524 SDValue ZEXT = N->getOperand(0);
24525 if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
24526 return SDValue();
24527
24528 SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(0);
24529 if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
24530 EXTRACT_VEC_ELT.getValueType() != MVT::i32)
24531 return SDValue();
24532
24533 if (!isNullConstant(EXTRACT_VEC_ELT.getOperand(1)))
24534 return SDValue();
24535
24536 SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(0);
24537 if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
24538 UADDLV.getValueType() != MVT::v4i32 ||
24539 UADDLV.getOperand(0).getValueType() != MVT::v8i8)
24540 return SDValue();
24541
24542 // Let's generate new sequence with AArch64ISD::NVCAST.
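// NVCAST is a free reinterpretation of the 64-bit vector register
// (v2i32 -> v1i64), so the widened sum stays in the vector register file
// instead of round-tripping through a GPR.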
24543 SDLoc DL(N);
24544 SDValue EXTRACT_SUBVEC =
24545 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
24546 DAG.getConstant(0, DL, MVT::i64));
24547 SDValue NVCAST =
24548 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, EXTRACT_SUBVEC);
24549
24550 return NVCAST;
24551}
24552
24553SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
24554 DAGCombinerInfo &DCI) const {
24555 SelectionDAG &DAG = DCI.DAG;
24556 switch (N->getOpcode()) {
24557 default:
24558 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
24559 break;
24560 case ISD::VECREDUCE_AND:
24561 case ISD::VECREDUCE_OR:
24562 case ISD::VECREDUCE_XOR:
24563 return performVecReduceBitwiseCombine(N, DCI, DAG);
24564 case ISD::ADD:
24565 case ISD::SUB:
24566 return performAddSubCombine(N, DCI);
24567 case ISD::BUILD_VECTOR:
24568 return performBuildVectorCombine(N, DCI, DAG);
24569 case ISD::TRUNCATE:
24570 return performTruncateCombine(N, DAG);
24571 case AArch64ISD::ANDS:
24572 return performFlagSettingCombine(N, DCI, ISD::AND);
24573 case AArch64ISD::ADC:
24574 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
24575 return R;
24576 return foldADCToCINC(N, DAG);
24577 case AArch64ISD::SBC:
24578 return foldOverflowCheck(N, DAG, /* IsAdd */ false);
24579 case AArch64ISD::ADCS:
24580 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
24581 return R;
24582 return performFlagSettingCombine(N, DCI, AArch64ISD::ADC);
24583 case AArch64ISD::SBCS:
24584 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
24585 return R;
24586 return performFlagSettingCombine(N, DCI, AArch64ISD::SBC);
24587 case AArch64ISD::BICi: {
24588 APInt DemandedBits =
24589 APInt::getAllOnes(N->getValueType(0).getScalarSizeInBits());
24590 APInt DemandedElts =
24591 APInt::getAllOnes(N->getValueType(0).getVectorNumElements());
24592
24593 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(
24594 SDValue(N, 0), DemandedBits, DemandedElts, DCI))
24595 return SDValue();
24596
24597 break;
24598 }
24599 case ISD::XOR:
24600 return performXorCombine(N, DAG, DCI, Subtarget);
24601 case ISD::MUL:
24602 return performMulCombine(N, DAG, DCI, Subtarget);
24603 case ISD::SINT_TO_FP:
24604 case ISD::UINT_TO_FP:
24605 return performIntToFpCombine(N, DAG, Subtarget);
24606 case ISD::FP_TO_SINT:
24607 case ISD::FP_TO_UINT:
24608 case ISD::FP_TO_SINT_SAT:
24609 case ISD::FP_TO_UINT_SAT:
24610 return performFpToIntCombine(N, DAG, DCI, Subtarget);
24611 case ISD::FDIV:
24612 return performFDivCombine(N, DAG, DCI, Subtarget);
24613 case ISD::OR:
24614 return performORCombine(N, DCI, Subtarget, *this);
24615 case ISD::AND:
24616 return performANDCombine(N, DCI);
24617 case ISD::FADD:
24618 return performFADDCombine(N, DCI);
24619 case ISD::INTRINSIC_WO_CHAIN:
24620 return performIntrinsicCombine(N, DCI, Subtarget);
24621 case ISD::ANY_EXTEND:
24622 case ISD::ZERO_EXTEND:
24623 case ISD::SIGN_EXTEND:
24624 return performExtendCombine(N, DCI, DAG);
24625 case ISD::SIGN_EXTEND_INREG:
24626 return performSignExtendInRegCombine(N, DCI, DAG);
24627 case ISD::CONCAT_VECTORS:
24628 return performConcatVectorsCombine(N, DCI, DAG);
24629 case ISD::EXTRACT_SUBVECTOR:
24630 return performExtractSubvectorCombine(N, DCI, DAG);
24631 case ISD::INSERT_SUBVECTOR:
24632 return performInsertSubvectorCombine(N, DCI, DAG);
24633 case ISD::SELECT:
24634 return performSelectCombine(N, DCI);
24635 case ISD::VSELECT:
24636 return performVSelectCombine(N, DCI.DAG);
24637 case ISD::SETCC:
24638 return performSETCCCombine(N, DCI, DAG);
24639 case ISD::LOAD:
24640 return performLOADCombine(N, DCI, DAG, Subtarget);
24641 case ISD::STORE:
24642 return performSTORECombine(N, DCI, DAG, Subtarget);
24643 case ISD::MSTORE:
24644 return performMSTORECombine(N, DCI, DAG, Subtarget);
24645 case ISD::MGATHER:
24646 case ISD::MSCATTER:
24647 return performMaskedGatherScatterCombine(N, DCI, DAG);
24648 case ISD::VECTOR_SPLICE:
24649 return performSVESpliceCombine(N, DAG);
24650 case ISD::FP_EXTEND:
24651 return performFPExtendCombine(N, DAG, DCI, Subtarget);
24652 case AArch64ISD::BRCOND:
24653 return performBRCONDCombine(N, DCI, DAG);
24654 case AArch64ISD::TBNZ:
24655 case AArch64ISD::TBZ:
24656 return performTBZCombine(N, DCI, DAG);
24657 case AArch64ISD::CSEL:
24658 return performCSELCombine(N, DCI, DAG);
24659 case AArch64ISD::DUP:
24660 case AArch64ISD::DUPLANE8:
24661 case AArch64ISD::DUPLANE16:
24662 case AArch64ISD::DUPLANE32:
24663 case AArch64ISD::DUPLANE64:
24664 return performDUPCombine(N, DCI);
24665 case AArch64ISD::DUPLANE128:
24666 return performDupLane128Combine(N, DAG);
24667 case AArch64ISD::NVCAST:
24668 return performNVCASTCombine(N, DAG);
24669 case AArch64ISD::SPLICE:
24670 return performSpliceCombine(N, DAG);
24671 case AArch64ISD::UUNPKLO:
24672 case AArch64ISD::UUNPKHI:
24673 return performUnpackCombine(N, DAG, Subtarget);
24674 case AArch64ISD::UZP1:
24675 return performUzpCombine(N, DAG, Subtarget);
24676 case AArch64ISD::SETCC_MERGE_ZERO:
24677 return performSetccMergeZeroCombine(N, DCI);
24694 return performGLD1Combine(N, DAG);
24695 case AArch64ISD::VASHR:
24696 case AArch64ISD::VLSHR:
24697 return performVectorShiftCombine(N, *this, DCI);
24698 case AArch64ISD::SUNPKLO:
24699 return performSunpkloCombine(N, DAG);
24700 case AArch64ISD::BSP:
24701 return performBSPExpandForSVE(N, DAG, Subtarget);
24702 case ISD::INSERT_VECTOR_ELT:
24703 return performInsertVectorEltCombine(N, DCI);
24704 case ISD::EXTRACT_VECTOR_ELT:
24705 return performExtractVectorEltCombine(N, DCI, Subtarget);
24706 case ISD::VECREDUCE_ADD:
24707 return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
24708 case AArch64ISD::UADDV:
24709 return performUADDVCombine(N, DAG);
24710 case AArch64ISD::SMULL:
24711 case AArch64ISD::UMULL:
24712 case AArch64ISD::PMULL:
24713 return performMULLCombine(N, DCI, DAG);
24714 case ISD::INTRINSIC_VOID:
24715 case ISD::INTRINSIC_W_CHAIN:
24716 switch (N->getConstantOperandVal(1)) {
24717 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
24718 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
24719 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
24720 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
24721 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
24722 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
24723 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
24724 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
24725 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
24726 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
24727 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
24728 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
24729 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
24730 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
24731 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
24732 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
24733 return legalizeSVEGatherPrefetchOffsVec(N, DAG);
24734 case Intrinsic::aarch64_neon_ld2:
24735 case Intrinsic::aarch64_neon_ld3:
24736 case Intrinsic::aarch64_neon_ld4:
24737 case Intrinsic::aarch64_neon_ld1x2:
24738 case Intrinsic::aarch64_neon_ld1x3:
24739 case Intrinsic::aarch64_neon_ld1x4:
24740 case Intrinsic::aarch64_neon_ld2lane:
24741 case Intrinsic::aarch64_neon_ld3lane:
24742 case Intrinsic::aarch64_neon_ld4lane:
24743 case Intrinsic::aarch64_neon_ld2r:
24744 case Intrinsic::aarch64_neon_ld3r:
24745 case Intrinsic::aarch64_neon_ld4r:
24746 case Intrinsic::aarch64_neon_st2:
24747 case Intrinsic::aarch64_neon_st3:
24748 case Intrinsic::aarch64_neon_st4:
24749 case Intrinsic::aarch64_neon_st1x2:
24750 case Intrinsic::aarch64_neon_st1x3:
24751 case Intrinsic::aarch64_neon_st1x4:
24752 case Intrinsic::aarch64_neon_st2lane:
24753 case Intrinsic::aarch64_neon_st3lane:
24754 case Intrinsic::aarch64_neon_st4lane:
24755 return performNEONPostLDSTCombine(N, DCI, DAG);
24756 case Intrinsic::aarch64_sve_ldnt1:
24757 return performLDNT1Combine(N, DAG);
24758 case Intrinsic::aarch64_sve_ld1rq:
24759 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
24760 case Intrinsic::aarch64_sve_ld1ro:
24761 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
24762 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
24764 case Intrinsic::aarch64_sve_ldnt1_gather:
24766 case Intrinsic::aarch64_sve_ldnt1_gather_index:
24767 return performGatherLoadCombine(N, DAG,
24769 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
24771 case Intrinsic::aarch64_sve_ld1:
24773 case Intrinsic::aarch64_sve_ldnf1:
24775 case Intrinsic::aarch64_sve_ldff1:
24777 case Intrinsic::aarch64_sve_st1:
24778 return performST1Combine(N, DAG);
24779 case Intrinsic::aarch64_sve_stnt1:
24780 return performSTNT1Combine(N, DAG);
24781 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
24783 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
24785 case Intrinsic::aarch64_sve_stnt1_scatter:
24787 case Intrinsic::aarch64_sve_stnt1_scatter_index:
24789 case Intrinsic::aarch64_sve_ld1_gather:
24791 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
24792 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
24794 case Intrinsic::aarch64_sve_ld1q_gather_index:
24795 return performGatherLoadCombine(N, DAG,
24797 case Intrinsic::aarch64_sve_ld1_gather_index:
24798 return performGatherLoadCombine(N, DAG,
24800 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
24802 /*OnlyPackedOffsets=*/false);
24803 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
24805 /*OnlyPackedOffsets=*/false);
24806 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
24807 return performGatherLoadCombine(N, DAG,
24809 /*OnlyPackedOffsets=*/false);
24810 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
24811 return performGatherLoadCombine(N, DAG,
24813 /*OnlyPackedOffsets=*/false);
24814 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
24816 case Intrinsic::aarch64_sve_ldff1_gather:
24818 case Intrinsic::aarch64_sve_ldff1_gather_index:
24819 return performGatherLoadCombine(N, DAG,
24821 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
24822 return performGatherLoadCombine(N, DAG,
24824 /*OnlyPackedOffsets=*/false);
24825 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
24826 return performGatherLoadCombine(N, DAG,
24828 /*OnlyPackedOffsets=*/false);
24829 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
24830 return performGatherLoadCombine(N, DAG,
24832 /*OnlyPackedOffsets=*/false);
24833 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
24834 return performGatherLoadCombine(N, DAG,
24836 /*OnlyPackedOffsets=*/false);
24837 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
24838 return performGatherLoadCombine(N, DAG,
24840 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
24841 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
24843 case Intrinsic::aarch64_sve_st1q_scatter_index:
24845 case Intrinsic::aarch64_sve_st1_scatter:
24847 case Intrinsic::aarch64_sve_st1_scatter_index:
24849 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
24851 /*OnlyPackedOffsets=*/false);
24852 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
24854 /*OnlyPackedOffsets=*/false);
24855 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
24856 return performScatterStoreCombine(N, DAG,
24858 /*OnlyPackedOffsets=*/false);
24859 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
24860 return performScatterStoreCombine(N, DAG,
24862 /*OnlyPackedOffsets=*/false);
24863 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
24865 case Intrinsic::aarch64_rndr:
24866 case Intrinsic::aarch64_rndrrs: {
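// Reading RNDR/RNDRRS via MRS also sets NZCV to indicate whether a valid
// random value was returned; the CSINC consumes that flag (via the glue
// result) to produce the i1 status returned alongside the value.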
24867 unsigned IntrinsicID = N->getConstantOperandVal(1);
24868 auto Register =
24869 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
24870 : AArch64SysReg::RNDRRS);
24871 SDLoc DL(N);
24872 SDValue A = DAG.getNode(
24873 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other),
24874 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i64));
24875 SDValue B = DAG.getNode(
24876 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
24877 DAG.getConstant(0, DL, MVT::i32),
24878 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
24879 return DAG.getMergeValues(
24880 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
24881 }
24882 case Intrinsic::aarch64_sme_ldr_zt:
24883 return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N),
24884 DAG.getVTList(MVT::Other), N->getOperand(0),
24885 N->getOperand(2), N->getOperand(3));
24886 case Intrinsic::aarch64_sme_str_zt:
24887 return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N),
24888 DAG.getVTList(MVT::Other), N->getOperand(0),
24889 N->getOperand(2), N->getOperand(3));
24890 default:
24891 break;
24892 }
24893 break;
24894 case ISD::GlobalAddress:
24895 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
24896 case ISD::CTLZ:
24897 return performCTLZCombine(N, DAG, Subtarget);
24898 case ISD::SCALAR_TO_VECTOR:
24899 return performScalarToVectorCombine(N, DCI, DAG);
24900 }
24901 return SDValue();
24902}
24903
24904// Check if the return value is used as only a return value, as otherwise
24905// we can't perform a tail-call. In particular, we need to check for
24906// target ISD nodes that are returns and any other "odd" constructs
24907// that the generic analysis code won't necessarily catch.
24908bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
24909 SDValue &Chain) const {
24910 if (N->getNumValues() != 1)
24911 return false;
24912 if (!N->hasNUsesOfValue(1, 0))
24913 return false;
24914
24915 SDValue TCChain = Chain;
24916 SDNode *Copy = *N->use_begin();
24917 if (Copy->getOpcode() == ISD::CopyToReg) {
24918 // If the copy has a glue operand, we conservatively assume it isn't safe to
24919 // perform a tail call.
24920 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
24921 MVT::Glue)
24922 return false;
24923 TCChain = Copy->getOperand(0);
24924 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
24925 return false;
24926
24927 bool HasRet = false;
24928 for (SDNode *Node : Copy->uses()) {
24929 if (Node->getOpcode() != AArch64ISD::RET_GLUE)
24930 return false;
24931 HasRet = true;
24932 }
24933
24934 if (!HasRet)
24935 return false;
24936
24937 Chain = TCChain;
24938 return true;
24939}
24940
24941// Return whether an instruction can potentially be optimized to a tail
24942// call. This will cause the optimizers to attempt to move, or duplicate,
24943// return instructions to help enable tail call optimizations for this
24944// instruction.
24945bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
24946 return CI->isTailCall();
24947}
24948
24949bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
24950 Register Offset, bool IsPre,
24951 MachineRegisterInfo &MRI) const {
24952 auto CstOffset = getIConstantVRegVal(Offset, MRI);
24953 if (!CstOffset || CstOffset->isZero())
24954 return false;
24955
24956 // All of the indexed addressing mode instructions take a signed 9 bit
24957 // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
24958 // encodes the sign/indexing direction.
24959 return isInt<9>(CstOffset->getSExtValue());
24960}
24961
24962bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
24963 SDValue &Base,
24964 SDValue &Offset,
24965 SelectionDAG &DAG) const {
24966 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
24967 return false;
24968
24969 // Non-null if there is exactly one user of the loaded value (ignoring chain).
24970 SDNode *ValOnlyUser = nullptr;
24971 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
24972 ++UI) {
24973 if (UI.getUse().getResNo() == 1)
24974 continue; // Ignore chain.
24975 if (ValOnlyUser == nullptr)
24976 ValOnlyUser = *UI;
24977 else {
24978 ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
24979 break;
24980 }
24981 }
24982
24983 auto IsUndefOrZero = [](SDValue V) {
24984 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
24985 };
24986
24987 // If the only user of the value is a scalable vector splat, it is
24988 // preferable to do a replicating load (ld1r*).
24989 if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
24990 (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
24991 (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
24992 IsUndefOrZero(ValOnlyUser->getOperand(2)))))
24993 return false;
24994
24995 Base = Op->getOperand(0);
24996 // All of the indexed addressing mode instructions take a signed
24997 // 9 bit immediate offset.
24998 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
24999 int64_t RHSC = RHS->getSExtValue();
25000 if (Op->getOpcode() == ISD::SUB)
25001 RHSC = -(uint64_t)RHSC;
25002 if (!isInt<9>(RHSC))
25003 return false;
25004 // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
25005 // when dealing with subtraction.
25006 Offset = DAG.getConstant(RHSC, SDLoc(N), RHS->getValueType(0));
25007 return true;
25008 }
25009 return false;
25010}
25011
25012bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
25013 SDValue &Offset,
25014 ISD::MemIndexedMode &AM,
25015 SelectionDAG &DAG) const {
25016 EVT VT;
25017 SDValue Ptr;
25018 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
25019 VT = LD->getMemoryVT();
25020 Ptr = LD->getBasePtr();
25021 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
25022 VT = ST->getMemoryVT();
25023 Ptr = ST->getBasePtr();
25024 } else
25025 return false;
25026
25027 if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, DAG))
25028 return false;
25029 AM = ISD::PRE_INC;
25030 return true;
25031}
25032
25033bool AArch64TargetLowering::getPostIndexedAddressParts(
25034 SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
25035 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
25036 EVT VT;
25037 SDValue Ptr;
25038 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
25039 VT = LD->getMemoryVT();
25040 Ptr = LD->getBasePtr();
25041 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
25042 VT = ST->getMemoryVT();
25043 Ptr = ST->getBasePtr();
25044 } else
25045 return false;
25046
25047 if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
25048 return false;
25049 // Post-indexing updates the base, so it's not a valid transform
25050 // if that's not the same as the load's pointer.
25051 if (Ptr != Base)
25052 return false;
25053 AM = ISD::POST_INC;
25054 return true;
25055}
25056
25057static void replaceBoolVectorBitcast(SDNode *N,
25058 SmallVectorImpl<SDValue> &Results,
25059 SelectionDAG &DAG) {
25060 SDLoc DL(N);
25061 SDValue Op = N->getOperand(0);
25062 EVT VT = N->getValueType(0);
25063 [[maybe_unused]] EVT SrcVT = Op.getValueType();
25064 assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
25065 "Must be bool vector.");
25066
25067 // Special handling for Clang's __builtin_convertvector. For vectors with <8
25068 // elements, it adds a vector concatenation with undef(s). If we encounter
25069 // this here, we can skip the concat.
25070 if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(0).isUndef()) {
25071 bool AllUndef = true;
25072 for (unsigned I = 1; I < Op.getNumOperands(); ++I)
25073 AllUndef &= Op.getOperand(I).isUndef();
25074
25075 if (AllUndef)
25076 Op = Op.getOperand(0);
25077 }
25078
25079 SDValue VectorBits = vectorToScalarBitmask(Op.getNode(), DAG);
25080 if (VectorBits)
25081 Results.push_back(DAG.getZExtOrTrunc(VectorBits, DL, VT));
25082}
25083
25084static void CustomNonLegalBITCASTResults(SDNode *N,
25085 SmallVectorImpl<SDValue> &Results,
25086 SelectionDAG &DAG, EVT ExtendVT,
25087 EVT CastVT) {
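// Bitcasts from a scalar to a small illegal vector type (e.g. i32 -> v2i16)
// are handled by moving the scalar into lane zero of a wider legal vector,
// bitcasting that register, and extracting the leading subvector of the
// requested type.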
25088 SDLoc DL(N);
25089 SDValue Op = N->getOperand(0);
25090 EVT VT = N->getValueType(0);
25091
25092 // Use SCALAR_TO_VECTOR for lane zero
25093 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtendVT, Op);
25094 SDValue CastVal = DAG.getNode(ISD::BITCAST, DL, CastVT, Vec);
25095 SDValue IdxZero = DAG.getVectorIdxConstant(0, DL);
25096 Results.push_back(
25097 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CastVal, IdxZero));
25098}
25099
25100void AArch64TargetLowering::ReplaceBITCASTResults(
25101 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
25102 SDLoc DL(N);
25103 SDValue Op = N->getOperand(0);
25104 EVT VT = N->getValueType(0);
25105 EVT SrcVT = Op.getValueType();
25106
25107 if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
25108 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
25109 return;
25110 }
25111
25112 if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
25113 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
25114 return;
25115 }
25116
25117 if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
25118 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
25119 return;
25120 }
25121
25122 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
25123 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
25124 "Expected fp->int bitcast!");
25125
25126 // Bitcasting between unpacked vector types of different element counts is
25127 // not a NOP because the live elements are laid out differently.
25128 // 01234567
25129 // e.g. nxv2i32 = XX??XX??
25130 // nxv4f16 = X?X?X?X?
25131 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
25132 return;
25133
25134 SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
25135 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
25136 return;
25137 }
25138
25139 if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
25140 !VT.isVector())
25141 return replaceBoolVectorBitcast(N, Results, DAG);
25142
25143 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
25144 return;
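// Remaining case: i16 <- f16/bf16. The half value lives in the low 16 bits of
// an FPR, so it is inserted into the h-subregister of an f32, bitcast to i32,
// and truncated to i16.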
25145
25146 Op = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
25147 DAG.getUNDEF(MVT::i32), Op);
25148 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
25149 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
25150}
25151
25152static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
25153 SelectionDAG &DAG,
25154 const AArch64Subtarget *Subtarget) {
25155 EVT VT = N->getValueType(0);
25156 if (!VT.is256BitVector() ||
25157 (VT.getScalarType().isFloatingPoint() &&
25158 !N->getFlags().hasAllowReassociation()) ||
25159 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
25160 VT.getScalarType() == MVT::bf16)
25161 return;
25162
25163 SDValue X = N->getOperand(0);
25164 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
25165 if (!Shuf) {
25166 Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
25167 X = N->getOperand(1);
25168 if (!Shuf)
25169 return;
25170 }
25171
25172 if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
25173 return;
25174
25175 // Check the mask is 1,0,3,2,5,4,...
25176 ArrayRef<int> Mask = Shuf->getMask();
25177 for (int I = 0, E = Mask.size(); I < E; I++)
25178 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
25179 return;
25180
25181 SDLoc DL(N);
25182 auto LoHi = DAG.SplitVector(X, DL);
25183 assert(LoHi.first.getValueType() == LoHi.second.getValueType());
25184 SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
25185 LoHi.first, LoHi.second);
25186
25187 // Shuffle the elements back into order.
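// Each ADDP lane holds one pairwise sum, while the original add+shuffle
// produced that sum in both lanes of the pair, so every ADDP lane index is
// pushed twice.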
25188 SmallVector<int> NMask;
25189 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
25190 NMask.push_back(I);
25191 NMask.push_back(I);
25192 }
25193 Results.push_back(
25194 DAG.getVectorShuffle(VT, DL,
25195 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
25196 DAG.getUNDEF(LoHi.first.getValueType())),
25197 DAG.getUNDEF(VT), NMask));
25198}
25199
25200static void ReplaceReductionResults(SDNode *N,
25201 SmallVectorImpl<SDValue> &Results,
25202 SelectionDAG &DAG, unsigned InterOp,
25203 unsigned AcrossOp) {
25204 EVT LoVT, HiVT;
25205 SDValue Lo, Hi;
25206 SDLoc dl(N);
25207 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
25208 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
25209 SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
25210 SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
25211 Results.push_back(SplitVal);
25212}
25213
25214void AArch64TargetLowering::ReplaceExtractSubVectorResults(
25215 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
25216 SDValue In = N->getOperand(0);
25217 EVT InVT = In.getValueType();
25218
25219 // Common code will handle these just fine.
25220 if (!InVT.isScalableVector() || !InVT.isInteger())
25221 return;
25222
25223 SDLoc DL(N);
25224 EVT VT = N->getValueType(0);
25225
25226 // The following checks bail if this is not a halving operation.
25227
25229
25230 if (InVT.getVectorElementCount() != (ResEC * 2))
25231 return;
25232
25233 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
25234 if (!CIndex)
25235 return;
25236
25237 unsigned Index = CIndex->getZExtValue();
25238 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
25239 return;
25240
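// UUNPKLO/UUNPKHI return the selected half with the elements widened, so the
// result is truncated back to the expected element type below.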
25241 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
25242 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
25243
25244 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
25245 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
25246}
25247
25248// Create an even/odd pair of X registers holding integer value V.
25249static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
25250 SDLoc dl(V.getNode());
25251 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i64, MVT::i64);
25252 if (DAG.getDataLayout().isBigEndian())
25253 std::swap (VLo, VHi);
25254 SDValue RegClass =
25255 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
25256 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
25257 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
25258 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
25259 return SDValue(
25260 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
25261}
25262
25263static void ReplaceCMP_SWAP_128Results(SDNode *N,
25264 SmallVectorImpl<SDValue> &Results,
25265 SelectionDAG &DAG,
25266 const AArch64Subtarget *Subtarget) {
25267 assert(N->getValueType(0) == MVT::i128 &&
25268 "AtomicCmpSwap on types less than 128 should be legal");
25269
25270 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
25271 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
25272 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
25273 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
25274 SDValue Ops[] = {
25275 createGPRPairNode(DAG, N->getOperand(2)), // Compare value
25276 createGPRPairNode(DAG, N->getOperand(3)), // Store value
25277 N->getOperand(1), // Ptr
25278 N->getOperand(0), // Chain in
25279 };
25280
25281 unsigned Opcode;
25282 switch (MemOp->getMergedOrdering()) {
25283 case AtomicOrdering::Monotonic:
25284 Opcode = AArch64::CASPX;
25285 break;
25286 case AtomicOrdering::Acquire:
25287 Opcode = AArch64::CASPAX;
25288 break;
25289 case AtomicOrdering::Release:
25290 Opcode = AArch64::CASPLX;
25291 break;
25292 case AtomicOrdering::AcquireRelease:
25293 case AtomicOrdering::SequentiallyConsistent:
25294 Opcode = AArch64::CASPALX;
25295 break;
25296 default:
25297 llvm_unreachable("Unexpected ordering!");
25298 }
25299
25300 MachineSDNode *CmpSwap = DAG.getMachineNode(
25301 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
25302 DAG.setNodeMemRefs(CmpSwap, {MemOp});
25303
25304 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
25305 if (DAG.getDataLayout().isBigEndian())
25306 std::swap(SubReg1, SubReg2);
25307 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
25308 SDValue(CmpSwap, 0));
25309 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
25310 SDValue(CmpSwap, 0));
25311 Results.push_back(
25312 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
25313 Results.push_back(SDValue(CmpSwap, 1)); // Chain out
25314 return;
25315 }
25316
25317 unsigned Opcode;
25318 switch (MemOp->getMergedOrdering()) {
25319 case AtomicOrdering::Monotonic:
25320 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
25321 break;
25322 case AtomicOrdering::Acquire:
25323 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
25324 break;
25325 case AtomicOrdering::Release:
25326 Opcode = AArch64::CMP_SWAP_128_RELEASE;
25327 break;
25328 case AtomicOrdering::AcquireRelease:
25329 case AtomicOrdering::SequentiallyConsistent:
25330 Opcode = AArch64::CMP_SWAP_128;
25331 break;
25332 default:
25333 llvm_unreachable("Unexpected ordering!");
25334 }
25335
25336 SDLoc DL(N);
25337 auto Desired = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64);
25338 auto New = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64);
25339 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
25340 New.first, New.second, N->getOperand(0)};
25341 SDNode *CmpSwap = DAG.getMachineNode(
25342 Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
25343 Ops);
25344 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
25345
25346 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
25347 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
25348 Results.push_back(SDValue(CmpSwap, 3));
25349}
25350
25351static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
25352 AtomicOrdering Ordering) {
25353 // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
25354 // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
25355 // the type is not legal. Therefore we shouldn't expect to see a 128-bit
25356 // ATOMIC_LOAD_CLR at any point.
25357 assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
25358 "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
25359 assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
25360 assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
25361
25362 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
25363 // The operand will need to be XORed in a separate step.
25364 switch (Ordering) {
25365 case AtomicOrdering::Monotonic:
25366 return AArch64::LDCLRP;
25367 break;
25368 case AtomicOrdering::Acquire:
25369 return AArch64::LDCLRPA;
25370 break;
25371 case AtomicOrdering::Release:
25372 return AArch64::LDCLRPL;
25373 break;
25374 case AtomicOrdering::AcquireRelease:
25375 case AtomicOrdering::SequentiallyConsistent:
25376 return AArch64::LDCLRPAL;
25377 break;
25378 default:
25379 llvm_unreachable("Unexpected ordering!");
25380 }
25381 }
25382
25383 if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
25384 switch (Ordering) {
25385 case AtomicOrdering::Monotonic:
25386 return AArch64::LDSETP;
25387 break;
25388 case AtomicOrdering::Acquire:
25389 return AArch64::LDSETPA;
25390 break;
25391 case AtomicOrdering::Release:
25392 return AArch64::LDSETPL;
25393 break;
25394 case AtomicOrdering::AcquireRelease:
25395 case AtomicOrdering::SequentiallyConsistent:
25396 return AArch64::LDSETPAL;
25397 break;
25398 default:
25399 llvm_unreachable("Unexpected ordering!");
25400 }
25401 }
25402
25403 if (ISDOpcode == ISD::ATOMIC_SWAP) {
25404 switch (Ordering) {
25405 case AtomicOrdering::Monotonic:
25406 return AArch64::SWPP;
25407 break;
25408 case AtomicOrdering::Acquire:
25409 return AArch64::SWPPA;
25410 break;
25411 case AtomicOrdering::Release:
25412 return AArch64::SWPPL;
25413 break;
25414 case AtomicOrdering::AcquireRelease:
25415 case AtomicOrdering::SequentiallyConsistent:
25416 return AArch64::SWPPAL;
25417 break;
25418 default:
25419 llvm_unreachable("Unexpected ordering!");
25420 }
25421 }
25422
25423 llvm_unreachable("Unexpected ISDOpcode!");
25424}
25425
25426static void ReplaceATOMIC_LOAD_128Results(SDNode *N,
25427 SmallVectorImpl<SDValue> &Results,
25428 SelectionDAG &DAG,
25429 const AArch64Subtarget *Subtarget) {
25430 // LSE128 has 128-bit RMW ops, but i128 is not a legal type, so lower them
25431 // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
25432 // rather than the CASP instructions, because CASP has register classes for
25433 // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
25434 // to present them as single operands. LSE128 instructions use the GPR64
25435 // register class (because the pair does not have to be sequential), like
25436 // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
25437
25438 assert(N->getValueType(0) == MVT::i128 &&
25439 "AtomicLoadXXX on types less than 128 should be legal");
25440
25441 if (!Subtarget->hasLSE128())
25442 return;
25443
25444 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
25445 const SDValue &Chain = N->getOperand(0);
25446 const SDValue &Ptr = N->getOperand(1);
25447 const SDValue &Val128 = N->getOperand(2);
25448 std::pair<SDValue, SDValue> Val2x64 =
25449 DAG.SplitScalar(Val128, SDLoc(Val128), MVT::i64, MVT::i64);
25450
25451 const unsigned ISDOpcode = N->getOpcode();
25452 const unsigned MachineOpcode =
25453 getAtomicLoad128Opcode(ISDOpcode, MemOp->getMergedOrdering());
25454
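// LDCLRP atomically clears the bits set in its operand (an AND-NOT), so the
// requested AND operand is inverted here, half by half.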
25455 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
25456 SDLoc dl(Val128);
25457 Val2x64.first =
25458 DAG.getNode(ISD::XOR, dl, MVT::i64,
25459 DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.first);
25460 Val2x64.second =
25461 DAG.getNode(ISD::XOR, dl, MVT::i64,
25462 DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.second);
25463 }
25464
25465 SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
25466 if (DAG.getDataLayout().isBigEndian())
25467 std::swap(Ops[0], Ops[1]);
25468
25469 MachineSDNode *AtomicInst =
25470 DAG.getMachineNode(MachineOpcode, SDLoc(N),
25471 DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops);
25472
25473 DAG.setNodeMemRefs(AtomicInst, {MemOp});
25474
25475 SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
25476 if (DAG.getDataLayout().isBigEndian())
25477 std::swap(Lo, Hi);
25478
25479 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
25480 Results.push_back(SDValue(AtomicInst, 2)); // Chain out
25481}
25482
25483void AArch64TargetLowering::ReplaceNodeResults(
25484 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
25485 switch (N->getOpcode()) {
25486 default:
25487 llvm_unreachable("Don't know how to custom expand this");
25488 case ISD::BITCAST:
25489 ReplaceBITCASTResults(N, Results, DAG);
25490 return;
25491 case ISD::VECREDUCE_ADD:
25492 case ISD::VECREDUCE_SMAX:
25493 case ISD::VECREDUCE_SMIN:
25494 case ISD::VECREDUCE_UMAX:
25495 case ISD::VECREDUCE_UMIN:
25496 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
25497 return;
25498 case ISD::ADD:
25499 case ISD::FADD:
25500 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
25501 return;
25502
25503 case ISD::CTPOP:
25504 case ISD::PARITY:
25505 if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
25506 Results.push_back(Result);
25507 return;
25508 case AArch64ISD::SADDV:
25509 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
25510 return;
25511 case AArch64ISD::UADDV:
25512 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
25513 return;
25514 case AArch64ISD::SMINV:
25515 ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
25516 return;
25517 case AArch64ISD::UMINV:
25518 ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
25519 return;
25520 case AArch64ISD::SMAXV:
25521 ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
25522 return;
25523 case AArch64ISD::UMAXV:
25524 ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
25525 return;
25526 case ISD::MULHS:
25528 Results.push_back(
25529 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHS_PRED));
25530 return;
25531 case ISD::MULHU:
25533 Results.push_back(
25534 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHU_PRED));
25535 return;
25536 case ISD::FP_TO_UINT:
25537 case ISD::FP_TO_SINT:
25538 case ISD::STRICT_FP_TO_SINT:
25539 case ISD::STRICT_FP_TO_UINT:
25540 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
25541 // Let normal code take care of it by not adding anything to Results.
25542 return;
25544 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
25545 return;
25546 case ISD::ATOMIC_LOAD_CLR:
25547 assert(N->getValueType(0) != MVT::i128 &&
25548 "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
25549 break;
25550 case ISD::ATOMIC_LOAD_AND:
25551 case ISD::ATOMIC_LOAD_OR:
25552 case ISD::ATOMIC_SWAP: {
25553 assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
25554 "Expected 128-bit atomicrmw.");
25555 // These need custom type legalisation so we go directly to instruction.
25556 ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
25557 return;
25558 }
25559 case ISD::ATOMIC_LOAD:
25560 case ISD::LOAD: {
25561 MemSDNode *LoadNode = cast<MemSDNode>(N);
25562 EVT MemVT = LoadNode->getMemoryVT();
25563 // Handle lowering 256 bit non temporal loads into LDNP for little-endian
25564 // targets.
25565 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
25566 MemVT.getSizeInBits() == 256u &&
25567 (MemVT.getScalarSizeInBits() == 8u ||
25568 MemVT.getScalarSizeInBits() == 16u ||
25569 MemVT.getScalarSizeInBits() == 32u ||
25570 MemVT.getScalarSizeInBits() == 64u)) {
25571
25572 SDValue Result = DAG.getMemIntrinsicNode(
25573 AArch64ISD::LDNP, SDLoc(N),
25574 DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
25575 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
25576 MVT::Other}),
25577 {LoadNode->getChain(), LoadNode->getBasePtr()},
25578 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
25579
25580 SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
25581 Result.getValue(0), Result.getValue(1));
25582 Results.append({Pair, Result.getValue(2) /* Chain */});
25583 return;
25584 }
25585
25586 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
25587 LoadNode->getMemoryVT() != MVT::i128) {
25588 // Non-volatile, non-atomic loads are optimized later in AArch64's
25589 // load/store optimizer.
25590 return;
25591 }
25592
25593 if (SDValue(N, 0).getValueType() == MVT::i128) {
25594 auto *AN = dyn_cast<AtomicSDNode>(LoadNode);
25595 bool isLoadAcquire =
25596 AN && AN->getOrdering() == AtomicOrdering::Acquire;
25597 unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
25598
25599 if (isLoadAcquire)
25600 assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
25601
25603 Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
25604 {LoadNode->getChain(), LoadNode->getBasePtr()},
25605 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
25606
25607 unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
25608
25609 SDValue Pair =
25610 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
25611 Result.getValue(FirstRes), Result.getValue(1 - FirstRes));
25612 Results.append({Pair, Result.getValue(2) /* Chain */});
25613 }
25614 return;
25615 }
25616 case ISD::EXTRACT_SUBVECTOR:
25617 ReplaceExtractSubVectorResults(N, Results, DAG);
25618 return;
25619 case ISD::INSERT_SUBVECTOR:
25620 case ISD::CONCAT_VECTORS:
25621 // Custom lowering has been requested for INSERT_SUBVECTOR and
25622 // CONCAT_VECTORS -- but delegate to common code for result type
25623 // legalisation
25624 return;
25625 case ISD::INTRINSIC_WO_CHAIN: {
25626 EVT VT = N->getValueType(0);
25627
25628 Intrinsic::ID IntID =
25629 static_cast<Intrinsic::ID>(N->getConstantOperandVal(0));
25630 switch (IntID) {
25631 default:
25632 return;
25633 case Intrinsic::aarch64_sve_clasta_n: {
25634 assert((VT == MVT::i8 || VT == MVT::i16) &&
25635 "custom lowering for unexpected type");
25636 SDLoc DL(N);
25637 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
25638 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
25639 N->getOperand(1), Op2, N->getOperand(3));
25640 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25641 return;
25642 }
25643 case Intrinsic::aarch64_sve_clastb_n: {
25644 assert((VT == MVT::i8 || VT == MVT::i16) &&
25645 "custom lowering for unexpected type");
25646 SDLoc DL(N);
25647 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
25648 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
25649 N->getOperand(1), Op2, N->getOperand(3));
25650 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25651 return;
25652 }
25653 case Intrinsic::aarch64_sve_lasta: {
25654 assert((VT == MVT::i8 || VT == MVT::i16) &&
25655 "custom lowering for unexpected type");
25656 SDLoc DL(N);
25657 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
25658 N->getOperand(1), N->getOperand(2));
25659 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25660 return;
25661 }
25662 case Intrinsic::aarch64_sve_lastb: {
25663 assert((VT == MVT::i8 || VT == MVT::i16) &&
25664 "custom lowering for unexpected type");
25665 SDLoc DL(N);
25666 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
25667 N->getOperand(1), N->getOperand(2));
25668 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25669 return;
25670 }
25671 case Intrinsic::get_active_lane_mask: {
25672 if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
25673 return;
25674
25675 // NOTE: Only trivial type promotion is supported.
25676 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
25677 if (NewVT.getVectorNumElements() != VT.getVectorNumElements())
25678 return;
25679
25680 SDLoc DL(N);
25681 auto V = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, NewVT, N->ops());
25682 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25683 return;
25684 }
25685 }
25686 }
25687 case ISD::READ_REGISTER: {
25688 SDLoc DL(N);
25689 assert(N->getValueType(0) == MVT::i128 &&
25690 "READ_REGISTER custom lowering is only for 128-bit sysregs");
25691 SDValue Chain = N->getOperand(0);
25692 SDValue SysRegName = N->getOperand(1);
25693
25694 SDValue Result = DAG.getNode(
25695 AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
25696 Chain, SysRegName);
25697
25698 // Sysregs are not endian. Result.getValue(0) always contains the lower half
25699 // of the 128-bit System Register value.
25700 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
25701 Result.getValue(0), Result.getValue(1));
25702 Results.push_back(Pair);
25703 Results.push_back(Result.getValue(2)); // Chain
25704 return;
25705 }
25706 }
25707}
25708
25709bool AArch64TargetLowering::useLoadStackGuardNode() const {
25710 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
25711 return false;
25712 return true;
25713}
25714
25715unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
25716 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
25717 // reciprocal if there are three or more FDIVs.
25718 return 3;
25719}
25720
25721TargetLoweringBase::LegalizeTypeAction
25722AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
25723 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
25724 // v4i16, v2i32 instead of to promote.
25725 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
25726 VT == MVT::v1f32)
25727 return TypeWidenVector;
25728
25729 return TargetLoweringBase::getPreferredVectorAction(VT);
25730}
25731
25732// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
25733// provided the address is 16-byte aligned.
25734bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
25735 if (!Subtarget->hasLSE2())
25736 return false;
25737
25738 if (auto LI = dyn_cast<LoadInst>(I))
25739 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
25740 LI->getAlign() >= Align(16);
25741
25742 if (auto SI = dyn_cast<StoreInst>(I))
25743 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25744 SI->getAlign() >= Align(16);
25745
25746 return false;
25747}
25748
25749bool AArch64TargetLowering::isOpSuitableForLSE128(const Instruction *I) const {
25750 if (!Subtarget->hasLSE128())
25751 return false;
25752
25753 // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
25754 // will clobber the two registers.
25755 if (const auto *SI = dyn_cast<StoreInst>(I))
25756 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25757 SI->getAlign() >= Align(16) &&
25758 (SI->getOrdering() == AtomicOrdering::Release ||
25759 SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);
25760
25761 if (const auto *RMW = dyn_cast<AtomicRMWInst>(I))
25762 return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25763 RMW->getAlign() >= Align(16) &&
25764 (RMW->getOperation() == AtomicRMWInst::Xchg ||
25765 RMW->getOperation() == AtomicRMWInst::And ||
25766 RMW->getOperation() == AtomicRMWInst::Or);
25767
25768 return false;
25769}
25770
25771bool AArch64TargetLowering::isOpSuitableForRCPC3(const Instruction *I) const {
25772 if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
25773 return false;
25774
25775 if (auto LI = dyn_cast<LoadInst>(I))
25776 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
25777 LI->getAlign() >= Align(16) &&
25778 LI->getOrdering() == AtomicOrdering::Acquire;
25779
25780 if (auto SI = dyn_cast<StoreInst>(I))
25781 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25782 SI->getAlign() >= Align(16) &&
25783 SI->getOrdering() == AtomicOrdering::Release;
25784
25785 return false;
25786}
25787
25788bool AArch64TargetLowering::shouldInsertFencesForAtomic(
25789 const Instruction *I) const {
25790 if (isOpSuitableForRCPC3(I))
25791 return false;
25792 if (isOpSuitableForLSE128(I))
25793 return false;
25794 if (isOpSuitableForLDPSTP(I))
25795 return true;
25796 return false;
25797}
25798
25799bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore(
25800 const Instruction *I) const {
25801 // Store-Release instructions only provide seq_cst guarantees when paired with
25802 // Load-Acquire instructions. MSVC CRT does not use these instructions to
25803 // implement seq_cst loads and stores, so we need additional explicit fences
25804 // after memory writes.
25805 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
25806 return false;
25807
25808 switch (I->getOpcode()) {
25809 default:
25810 return false;
25811 case Instruction::AtomicCmpXchg:
25812 return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() ==
25813 AtomicOrdering::SequentiallyConsistent;
25814 case Instruction::AtomicRMW:
25815 return cast<AtomicRMWInst>(I)->getOrdering() ==
25816 AtomicOrdering::SequentiallyConsistent;
25817 case Instruction::Store:
25818 return cast<StoreInst>(I)->getOrdering() ==
25819 AtomicOrdering::SequentiallyConsistent;
25820 }
25821}
25822
25823// Loads and stores less than 128-bits are already atomic; ones above that
25824// are doomed anyway, so defer to the default libcall and blame the OS when
25825// things go wrong.
25826TargetLowering::AtomicExpansionKind
25827AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
25828 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
25829 if (Size != 128)
25831 if (isOpSuitableForRCPC3(SI))
25833 if (isOpSuitableForLSE128(SI))
25835 if (isOpSuitableForLDPSTP(SI))
25838}
25839
25840// Loads and stores less than 128-bits are already atomic; ones above that
25841// are doomed anyway, so defer to the default libcall and blame the OS when
25842// things go wrong.
25843TargetLowering::AtomicExpansionKind
25844AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
25845 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
25846
25847 if (Size != 128)
25848 return AtomicExpansionKind::None;
25849 if (isOpSuitableForRCPC3(LI))
25850 return AtomicExpansionKind::None;
25851 // No LSE128 loads
25852 if (isOpSuitableForLDPSTP(LI))
25853 return AtomicExpansionKind::None;
25854
25855 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25856 // implement atomicrmw without spilling. If the target address is also on the
25857 // stack and close enough to the spill slot, this can lead to a situation
25858 // where the monitor always gets cleared and the atomic operation can never
25859 // succeed. So at -O0 lower this operation to a CAS loop.
25860 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
25861 return AtomicExpansionKind::CmpXChg;
25862
25863 // Using CAS for an atomic load has a better chance of succeeding under high
25864 // contention situations. So use it if available.
25865 return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
25866 : AtomicExpansionKind::LLSC;
25867}
25868
25869// The "default" for integer RMW operations is to expand to an LL/SC loop.
25870// However, with the LSE instructions (or outline-atomics mode, which provides
25871// library routines in place of the LSE-instructions), we can directly emit many
25872// operations instead.
25873//
25874// Floating-point operations are always emitted to a cmpxchg loop, because they
25875// may trigger a trap which aborts an LLSC sequence.
25876TargetLowering::AtomicExpansionKind
25877AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
25878 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
25879 assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
25880
25881 if (AI->isFloatingPointOperation())
25882 return AtomicExpansionKind::CmpXChg;
25883
25884 bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
25885 (AI->getOperation() == AtomicRMWInst::Xchg ||
25886 AI->getOperation() == AtomicRMWInst::Or ||
25887 AI->getOperation() == AtomicRMWInst::And);
25888 if (CanUseLSE128)
25889 return AtomicExpansionKind::None;
25890
25891 // Nand is not supported in LSE.
25892 // Leave 128 bits to LLSC or CmpXChg.
25893 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
25894 if (Subtarget->hasLSE())
25895 return AtomicExpansionKind::None;
25896 if (Subtarget->outlineAtomics()) {
25897 // [U]Min/[U]Max RWM atomics are used in __sync_fetch_ libcalls so far.
25898 // Don't outline them unless
25899 // (1) high level <atomic> support approved:
25900 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
25901 // (2) low level libgcc and compiler-rt support implemented by:
25902 // min/max outline atomics helpers
25903 if (AI->getOperation() != AtomicRMWInst::Min &&
25904 AI->getOperation() != AtomicRMWInst::Max &&
25905 AI->getOperation() != AtomicRMWInst::UMin &&
25906 AI->getOperation() != AtomicRMWInst::UMax) {
25907 return AtomicExpansionKind::None;
25908 }
25909 }
25910 }
25911
25912 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25913 // implement atomicrmw without spilling. If the target address is also on the
25914 // stack and close enough to the spill slot, this can lead to a situation
25915 // where the monitor always gets cleared and the atomic operation can never
25916 // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
25917 // we have a single CAS instruction that can replace the loop.
25918 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None ||
25919 Subtarget->hasLSE())
25920 return AtomicExpansionKind::CmpXChg;
25921
25922 return AtomicExpansionKind::LLSC;
25923}
25924
25925TargetLowering::AtomicExpansionKind
25926AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
25927 AtomicCmpXchgInst *AI) const {
25928 // If subtarget has LSE, leave cmpxchg intact for codegen.
25929 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
25930 return AtomicExpansionKind::None;
25931 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25932 // implement cmpxchg without spilling. If the address being exchanged is also
25933 // on the stack and close enough to the spill slot, this can lead to a
25934 // situation where the monitor always gets cleared and the atomic operation
25935 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
25936 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
25937 return AtomicExpansionKind::None;
25938
25939 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
25940 // it.
25941 unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
25942 if (Size > 64)
25943 return AtomicExpansionKind::None;
25944
25945 return AtomicExpansionKind::LLSC;
25946}
25947
25948Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
25949 Type *ValueTy, Value *Addr,
25950 AtomicOrdering Ord) const {
25951 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25952 bool IsAcquire = isAcquireOrStronger(Ord);
25953
25954 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
25955 // intrinsic must return {i64, i64} and we have to recombine them into a
25956 // single i128 here.
25957 if (ValueTy->getPrimitiveSizeInBits() == 128) {
25958 Intrinsic::ID Int =
25959 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
25960 Function *Ldxr = Intrinsic::getDeclaration(M, Int);
25961
25962 Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
25963
25964 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
25965 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
25966 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
25967 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
25968 return Builder.CreateOr(
25969 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64");
25970 }
25971
25972 Type *Tys[] = { Addr->getType() };
25973 Intrinsic::ID Int =
25974 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
25975 Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
25976
25977 const DataLayout &DL = M->getDataLayout();
25978 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
25979 CallInst *CI = Builder.CreateCall(Ldxr, Addr);
25980 CI->addParamAttr(
25981 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy));
25982 Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
25983
25984 return Builder.CreateBitCast(Trunc, ValueTy);
25985}
25986
25987void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
25988 IRBuilderBase &Builder) const {
25989 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25990 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
25991}
25992
25994 Value *Val, Value *Addr,
25995 AtomicOrdering Ord) const {
25996 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25997 bool IsRelease = isReleaseOrStronger(Ord);
25998
25999 // Since the intrinsics must have legal type, the i128 intrinsics take two
26000 // parameters: "i64, i64". We must marshal Val into the appropriate form
26001 // before the call.
26002 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
26003 Intrinsic::ID Int =
26004 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
26005 Function *Stxr = Intrinsic::getDeclaration(M, Int);
26006 Type *Int64Ty = Type::getInt64Ty(M->getContext());
26007
26008 Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
26009 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
26010 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
26011 }
26012
26013 Intrinsic::ID Int =
26014 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
26015 Type *Tys[] = { Addr->getType() };
26016 Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
26017
26018 const DataLayout &DL = M->getDataLayout();
26019 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
26020 Val = Builder.CreateBitCast(Val, IntValTy);
26021
26022 CallInst *CI = Builder.CreateCall(
26023 Stxr, {Builder.CreateZExtOrBitCast(
26024 Val, Stxr->getFunctionType()->getParamType(0)),
26025 Addr});
26026 CI->addParamAttr(1, Attribute::get(Builder.getContext(),
26027 Attribute::ElementType, Val->getType()));
26028 return CI;
26029}
26030
26031bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
26032 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
26033 const DataLayout &DL) const {
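// For a non-aggregate type, only scalable values wider than a single SVE
// vector register (128-bit minimum) are reported as needing consecutive
// registers.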
26034 if (!Ty->isArrayTy()) {
26035 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
26036 return TySize.isScalable() && TySize.getKnownMinValue() > 128;
26037 }
26038
26039 // All non aggregate members of the type must have the same type
26040 SmallVector<EVT> ValueVTs;
26041 ComputeValueVTs(*this, DL, Ty, ValueVTs);
26042 return all_equal(ValueVTs);
26043}
26044
26045bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
26046 EVT) const {
26047 return false;
26048}
26049
26050static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
26051 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
26052 Function *ThreadPointerFunc =
26053 Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
26054 return IRB.CreatePointerCast(
26055 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
26056 Offset),
26057 IRB.getPtrTy(0));
26058}
26059
26060Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
26061 // Android provides a fixed TLS slot for the stack cookie. See the definition
26062 // of TLS_SLOT_STACK_GUARD in
26063 // https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/tls_defines.h
26064 if (Subtarget->isTargetAndroid())
26065 return UseTlsOffset(IRB, 0x28);
26066
26067 // Fuchsia is similar.
26068 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
26069 if (Subtarget->isTargetFuchsia())
26070 return UseTlsOffset(IRB, -0x10);
26071
26072 return TargetLowering::getIRStackGuard(IRB);
26073}
26074
26075void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
26076 // MSVC CRT provides functionalities for stack protection.
26077 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
26078 // MSVC CRT has a global variable holding security cookie.
26079 M.getOrInsertGlobal("__security_cookie",
26080 PointerType::getUnqual(M.getContext()));
26081
26082 // MSVC CRT has a function to validate security cookie.
26083 FunctionCallee SecurityCheckCookie =
26084 M.getOrInsertFunction(Subtarget->getSecurityCheckCookieName(),
26085 Type::getVoidTy(M.getContext()),
26086 PointerType::getUnqual(M.getContext()));
26087 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
26088 F->setCallingConv(CallingConv::Win64);
26089 F->addParamAttr(0, Attribute::AttrKind::InReg);
26090 }
26091 return;
26092 }
26093 TargetLowering::insertSSPDeclarations(M);
26094}
26095
26096Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
26097 // MSVC CRT has a global variable holding security cookie.
26098 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
26099 return M.getGlobalVariable("__security_cookie");
26100 return TargetLowering::getSDagStackGuard(M);
26101}
26102
26103Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
26104 // MSVC CRT has a function to validate security cookie.
26105 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
26106 return M.getFunction(Subtarget->getSecurityCheckCookieName());
26107 return TargetLowering::getSSPStackGuardCheck(M);
26108}
26109
26110Value *
26111AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
26112 // Android provides a fixed TLS slot for the SafeStack pointer. See the
26113 // definition of TLS_SLOT_SAFESTACK in
26114 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
26115 if (Subtarget->isTargetAndroid())
26116 return UseTlsOffset(IRB, 0x48);
26117
26118 // Fuchsia is similar.
26119 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
26120 if (Subtarget->isTargetFuchsia())
26121 return UseTlsOffset(IRB, -0x8);
26122
26123 return TargetLowering::getSafeStackPointerLocation(IRB);
26124}
26125
26126bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
26127 const Instruction &AndI) const {
26128 // Only sink 'and' mask to cmp use block if it is masking a single bit, since
26129 // this is likely to fold the and/cmp/br into a single tbz instruction. It
26130 // may be beneficial to sink in other cases, but we would have to check that
26131 // the cmp would not get folded into the br to form a cbz for these to be
26132 // beneficial.
26133 ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
26134 if (!Mask)
26135 return false;
26136 return Mask->getValue().isPowerOf2();
26137}
26138
26142 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
26143 SelectionDAG &DAG) const {
26144 // Does baseline recommend not to perform the fold by default?
26146 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
26147 return false;
26148 // Else, if this is a vector shift, prefer 'shl'.
26149 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
26150}
26151
26154 SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
26156 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
26159 ExpansionFactor);
26160}
26161
26162void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
26163 // Update IsSplitCSR in AArch64FunctionInfo.
26164 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
26165 AFI->setIsSplitCSR(true);
26166}
26167
26169 MachineBasicBlock *Entry,
26170 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
26171 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
26172 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
26173 if (!IStart)
26174 return;
26175
26176 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
26177 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
26178 MachineBasicBlock::iterator MBBI = Entry->begin();
26179 for (const MCPhysReg *I = IStart; *I; ++I) {
26180 const TargetRegisterClass *RC = nullptr;
26181 if (AArch64::GPR64RegClass.contains(*I))
26182 RC = &AArch64::GPR64RegClass;
26183 else if (AArch64::FPR64RegClass.contains(*I))
26184 RC = &AArch64::FPR64RegClass;
26185 else
26186 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
26187
26188 Register NewVR = MRI->createVirtualRegister(RC);
26189 // Create copy from CSR to a virtual register.
26190 // FIXME: this currently does not emit CFI pseudo-instructions, it works
26191 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
26192 // nounwind. If we want to generalize this later, we may need to emit
26193 // CFI pseudo-instructions.
26194 assert(Entry->getParent()->getFunction().hasFnAttribute(
26195 Attribute::NoUnwind) &&
26196 "Function should be nounwind in insertCopiesSplitCSR!");
26197 Entry->addLiveIn(*I);
26198 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
26199 .addReg(*I);
26200
26201 // Insert the copy-back instructions right before the terminator.
26202 for (auto *Exit : Exits)
26203 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
26204 TII->get(TargetOpcode::COPY), *I)
26205 .addReg(NewVR);
26206 }
26207}
26208
26210 // Integer division on AArch64 is expensive. However, when aggressively
26211 // optimizing for code size, we prefer to use a div instruction, as it is
26212 // usually smaller than the alternative sequence.
26213 // The exception to this is vector division. Since AArch64 doesn't have vector
26214 // integer division, leaving the division as-is is a loss even in terms of
26215 // size, because it will have to be scalarized, while the alternative code
26216 // sequence can be performed in vector form.
26217 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
26218 return OptSize && !VT.isVector();
26219}
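// Example (illustrative): when optimizing for minimum size, a scalar
// "sdiv i32 %a, 7" is kept as one sdiv instruction, whereas a v4i32 division
// is still expanded, since scalarising it into four sdivs would be larger
// than the vectorised multiply/shift sequence.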
26220
26222 // We want inc-of-add for scalars and sub-of-not for vectors.
26223 return VT.isScalarInteger();
26224}
26225
26227 EVT VT) const {
26228 // v8f16 without fp16 needs to be extended to v8f32, which is more difficult to
26229 // legalize.
26230 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
26231 return false;
26232 if (FPVT == MVT::v8bf16)
26233 return false;
26234 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
26235}
26236
26240 const TargetInstrInfo *TII) const {
26241 assert(MBBI->isCall() && MBBI->getCFIType() &&
26242 "Invalid call instruction for a KCFI check");
26243
26244 switch (MBBI->getOpcode()) {
26245 case AArch64::BLR:
26246 case AArch64::BLRNoIP:
26247 case AArch64::TCRETURNri:
26248 case AArch64::TCRETURNrix16x17:
26249 case AArch64::TCRETURNrix17:
26250 case AArch64::TCRETURNrinotx16:
26251 break;
26252 default:
26253 llvm_unreachable("Unexpected CFI call opcode");
26254 }
26255
26256 MachineOperand &Target = MBBI->getOperand(0);
26257 assert(Target.isReg() && "Invalid target operand for an indirect call");
26258 Target.setIsRenamable(false);
26259
26260 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK))
26261 .addReg(Target.getReg())
26262 .addImm(MBBI->getCFIType())
26263 .getInstr();
26264}
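// Sketch of the effect: for an indirect call such as "blr x8" that carries a
// CFI type id, a KCFI_CHECK pseudo on x8 with that type id is emitted just
// before the call and is expanded later into the actual check-and-trap
// sequence.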
26265
26267 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
26268}
26269
26270unsigned
26272 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
26273 return getPointerTy(DL).getSizeInBits();
26274
26275 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
26276}
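// Illustrative note: on AAPCS64 the va_list is a structure of three pointers
// (__stack, __gr_top, __vr_top) plus two 32-bit offsets (__gr_offs,
// __vr_offs), which is where 3 * pointer-size + 2 * 32 comes from; Darwin and
// Windows use a single pointer-sized va_list instead.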
26277
26278void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
26279 MachineFrameInfo &MFI = MF.getFrameInfo();
26280 // If we have any vulnerable SVE stack objects then the stack protector
26281 // needs to be placed at the top of the SVE stack area, as the SVE locals
26282 // are placed above the other locals, so we allocate it as if it were a
26283 // scalable vector.
26284 // FIXME: It may be worthwhile having a specific interface for this rather
26285 // than doing it here in finalizeLowering.
26286 if (MFI.hasStackProtectorIndex()) {
26287 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
26293 break;
26294 }
26295 }
26296 }
26299}
26300
26301// Unlike X86, we let frame lowering assign offsets to all catch objects.
26303 return false;
26304}
26305
26306bool AArch64TargetLowering::shouldLocalize(
26307 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
26308 auto &MF = *MI.getMF();
26309 auto &MRI = MF.getRegInfo();
26310 auto maxUses = [](unsigned RematCost) {
26311 // A cost of 1 means remats are basically free.
26312 if (RematCost == 1)
26313 return std::numeric_limits<unsigned>::max();
26314 if (RematCost == 2)
26315 return 2U;
26316
26317 // Remat is too expensive, only sink if there's one user.
26318 if (RematCost > 2)
26319 return 1U;
26320 llvm_unreachable("Unexpected remat cost");
26321 };
26322
26323 unsigned Opc = MI.getOpcode();
26324 switch (Opc) {
26325 case TargetOpcode::G_GLOBAL_VALUE: {
26326 // On Darwin, TLS global vars get selected into function calls, which
26327 // we don't want localized, as they can get moved into the middle of
26328 // another call sequence.
26329 const GlobalValue &GV = *MI.getOperand(1).getGlobal();
26330 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
26331 return false;
26332 return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
26333 }
26334 case TargetOpcode::G_FCONSTANT:
26335 case TargetOpcode::G_CONSTANT: {
26336 const ConstantInt *CI;
26337 unsigned AdditionalCost = 0;
26338
26339 if (Opc == TargetOpcode::G_CONSTANT)
26340 CI = MI.getOperand(1).getCImm();
26341 else {
26342 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
26343 // We try to estimate cost of 32/64b fpimms, as they'll likely be
26344 // materialized as integers.
26345 if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
26346 break;
26347 auto APF = MI.getOperand(1).getFPImm()->getValueAPF();
26348 bool OptForSize =
26351 OptForSize))
26352 return true; // Constant should be cheap.
26353 CI =
26354 ConstantInt::get(MF.getFunction().getContext(), APF.bitcastToAPInt());
26355 // FP materialization also costs an extra move, from gpr to fpr.
26356 AdditionalCost = 1;
26357 }
26358 APInt Imm = CI->getValue();
26361 assert(Cost.isValid() && "Expected a valid imm cost");
26362
26363 unsigned RematCost = *Cost.getValue();
26364 RematCost += AdditionalCost;
26365 Register Reg = MI.getOperand(0).getReg();
26366 unsigned MaxUses = maxUses(RematCost);
26367 // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
26368 if (MaxUses == std::numeric_limits<unsigned>::max())
26369 --MaxUses;
26370 return MRI.hasAtMostUserInstrs(Reg, MaxUses);
26371 }
26372 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
26373 // localizable.
26374 case AArch64::ADRP:
26375 case AArch64::G_ADD_LOW:
26376 // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
26377 case TargetOpcode::G_PTR_ADD:
26378 return true;
26379 default:
26380 break;
26381 }
26383}
26384
26386 if (Inst.getType()->isScalableTy())
26387 return true;
26388
26389 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
26390 if (Inst.getOperand(i)->getType()->isScalableTy())
26391 return true;
26392
26393 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
26394 if (AI->getAllocatedType()->isScalableTy())
26395 return true;
26396 }
26397
26398 // Checks to allow the use of SME instructions
26399 if (auto *Base = dyn_cast<CallBase>(&Inst)) {
26400 auto CallerAttrs = SMEAttrs(*Inst.getFunction());
26401 auto CalleeAttrs = SMEAttrs(*Base);
26402 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
26403 CallerAttrs.requiresLazySave(CalleeAttrs) ||
26404 CallerAttrs.requiresPreservingZT0(CalleeAttrs))
26405 return true;
26406 }
26407 return false;
26408}
26409
26410// Return the largest legal scalable vector type that matches VT's element type.
26414 "Expected legal fixed length vector!");
26415 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
26416 default:
26417 llvm_unreachable("unexpected element type for SVE container");
26418 case MVT::i8:
26419 return EVT(MVT::nxv16i8);
26420 case MVT::i16:
26421 return EVT(MVT::nxv8i16);
26422 case MVT::i32:
26423 return EVT(MVT::nxv4i32);
26424 case MVT::i64:
26425 return EVT(MVT::nxv2i64);
26426 case MVT::bf16:
26427 return EVT(MVT::nxv8bf16);
26428 case MVT::f16:
26429 return EVT(MVT::nxv8f16);
26430 case MVT::f32:
26431 return EVT(MVT::nxv4f32);
26432 case MVT::f64:
26433 return EVT(MVT::nxv2f64);
26434 }
26435}
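// For example, a legal fixed-length v8i16 or v4f32 value is carried in an
// nxv8i16 or nxv4f32 container respectively; only the low fixed-length part
// of the scalable register holds defined data.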
26436
26437// Return a PTRUE with active lanes corresponding to the extent of VT.
26439 EVT VT) {
26442 "Expected legal fixed length vector!");
26443
26444 std::optional<unsigned> PgPattern =
26446 assert(PgPattern && "Unexpected element count for SVE predicate");
26447
26448 // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
26449 // AArch64SVEPredPattern::all, which can enable the use of unpredicated
26450 // variants of instructions when available.
26451 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
26452 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
26453 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
26454 if (MaxSVESize && MinSVESize == MaxSVESize &&
26455 MaxSVESize == VT.getSizeInBits())
26456 PgPattern = AArch64SVEPredPattern::all;
26457
26458 MVT MaskVT;
26459 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
26460 default:
26461 llvm_unreachable("unexpected element type for SVE predicate");
26462 case MVT::i8:
26463 MaskVT = MVT::nxv16i1;
26464 break;
26465 case MVT::i16:
26466 case MVT::f16:
26467 case MVT::bf16:
26468 MaskVT = MVT::nxv8i1;
26469 break;
26470 case MVT::i32:
26471 case MVT::f32:
26472 MaskVT = MVT::nxv4i1;
26473 break;
26474 case MVT::i64:
26475 case MVT::f64:
26476 MaskVT = MVT::nxv2i1;
26477 break;
26478 }
26479
26480 return getPTrue(DAG, DL, MaskVT, *PgPattern);
26481}
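// Illustrative example, assuming a target with a 256-bit minimum SVE vector
// length: a v8i32 fixed-length operand uses an nxv4i1 predicate built with
// the VL8 pattern, and if the register size is known to be exactly 256 bits
// the ALL pattern is used instead so unpredicated instructions can be picked.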
26482
26484 EVT VT) {
26486 "Expected legal scalable vector!");
26487 auto PredTy = VT.changeVectorElementType(MVT::i1);
26488 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
26489}
26490
26492 if (VT.isFixedLengthVector())
26493 return getPredicateForFixedLengthVector(DAG, DL, VT);
26494
26495 return getPredicateForScalableVector(DAG, DL, VT);
26496}
26497
26498// Grow V to consume an entire SVE register.
26500 assert(VT.isScalableVector() &&
26501 "Expected to convert into a scalable vector!");
26502 assert(V.getValueType().isFixedLengthVector() &&
26503 "Expected a fixed length vector operand!");
26504 SDLoc DL(V);
26505 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
26506 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
26507}
26508
26509// Shrink V so it's just big enough to maintain a VT's worth of data.
26512 "Expected to convert into a fixed length vector!");
26513 assert(V.getValueType().isScalableVector() &&
26514 "Expected a scalable vector operand!");
26515 SDLoc DL(V);
26516 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
26517 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
26518}
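// Sketch of the usual round trip: convertToScalableVector inserts a v4i32
// value at index 0 of an undef nxv4i32, the predicated SVE operation runs on
// the container, and convertFromScalableVector extracts the low v4i32 slice
// of the result.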
26519
26520// Convert all fixed length vector loads larger than NEON to masked_loads.
26521SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
26522 SDValue Op, SelectionDAG &DAG) const {
26523 auto Load = cast<LoadSDNode>(Op);
26524
26525 SDLoc DL(Op);
26526 EVT VT = Op.getValueType();
26527 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26528 EVT LoadVT = ContainerVT;
26529 EVT MemVT = Load->getMemoryVT();
26530
26531 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
26532
26533 if (VT.isFloatingPoint()) {
26534 LoadVT = ContainerVT.changeTypeToInteger();
26535 MemVT = MemVT.changeTypeToInteger();
26536 }
26537
26538 SDValue NewLoad = DAG.getMaskedLoad(
26539 LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
26540 DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
26541 Load->getAddressingMode(), Load->getExtensionType());
26542
26543 SDValue Result = NewLoad;
26544 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
26545 EVT ExtendVT = ContainerVT.changeVectorElementType(
26546 Load->getMemoryVT().getVectorElementType());
26547
26548 Result = getSVESafeBitCast(ExtendVT, Result, DAG);
26550 Pg, Result, DAG.getUNDEF(ContainerVT));
26551 } else if (VT.isFloatingPoint()) {
26552 Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
26553 }
26554
26555 Result = convertFromScalableVector(DAG, VT, Result);
26556 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
26557 return DAG.getMergeValues(MergedValues, DL);
26558}
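// Sketch, assuming SVE is used for 256-bit fixed-length vectors: a plain
// "load <8 x i32>, ptr %p" becomes a masked load of the nxv4i32 container
// with a VL8 ptrue predicate, and the loaded value is then narrowed back to
// v8i32.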
26559
26561 SelectionDAG &DAG) {
26562 SDLoc DL(Mask);
26563 EVT InVT = Mask.getValueType();
26564 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26565
26566 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
26567
26568 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
26569 return Pg;
26570
26571 auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
26572 auto Op2 = DAG.getConstant(0, DL, ContainerVT);
26573
26575 {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});
26576}
26577
26578// Convert all fixed length vector masked loads larger than NEON to SVE masked loads.
26579SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
26580 SDValue Op, SelectionDAG &DAG) const {
26581 auto Load = cast<MaskedLoadSDNode>(Op);
26582
26583 SDLoc DL(Op);
26584 EVT VT = Op.getValueType();
26585 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26586
26587 SDValue Mask = Load->getMask();
26588 // If this is an extending load and the mask type is not the same as
26589 // the load's type, then we have to extend the mask type.
26590 if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
26591 assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
26592 "Incorrect mask type");
26593 Mask = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Mask);
26594 }
26596
26597 SDValue PassThru;
26598 bool IsPassThruZeroOrUndef = false;
26599
26600 if (Load->getPassThru()->isUndef()) {
26601 PassThru = DAG.getUNDEF(ContainerVT);
26602 IsPassThruZeroOrUndef = true;
26603 } else {
26604 if (ContainerVT.isInteger())
26605 PassThru = DAG.getConstant(0, DL, ContainerVT);
26606 else
26607 PassThru = DAG.getConstantFP(0, DL, ContainerVT);
26608 if (isZerosVector(Load->getPassThru().getNode()))
26609 IsPassThruZeroOrUndef = true;
26610 }
26611
26612 SDValue NewLoad = DAG.getMaskedLoad(
26613 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
26614 Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
26615 Load->getAddressingMode(), Load->getExtensionType());
26616
26617 SDValue Result = NewLoad;
26618 if (!IsPassThruZeroOrUndef) {
26619 SDValue OldPassThru =
26620 convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
26621 Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
26622 }
26623
26624 Result = convertFromScalableVector(DAG, VT, Result);
26625 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
26626 return DAG.getMergeValues(MergedValues, DL);
26627}
26628
26629// Convert all fixed length vector stores larger than NEON to masked_stores.
26630SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
26631 SDValue Op, SelectionDAG &DAG) const {
26632 auto Store = cast<StoreSDNode>(Op);
26633
26634 SDLoc DL(Op);
26635 EVT VT = Store->getValue().getValueType();
26636 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26637 EVT MemVT = Store->getMemoryVT();
26638
26639 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
26640 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
26641
26642 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
26643 EVT TruncVT = ContainerVT.changeVectorElementType(
26644 Store->getMemoryVT().getVectorElementType());
26645 MemVT = MemVT.changeTypeToInteger();
26646 NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
26647 NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
26648 DAG.getUNDEF(TruncVT));
26649 NewValue =
26650 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
26651 } else if (VT.isFloatingPoint()) {
26652 MemVT = MemVT.changeTypeToInteger();
26653 NewValue =
26654 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
26655 }
26656
26657 return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
26658 Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
26659 Store->getMemOperand(), Store->getAddressingMode(),
26660 Store->isTruncatingStore());
26661}
26662
26663SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
26664 SDValue Op, SelectionDAG &DAG) const {
26665 auto *Store = cast<MaskedStoreSDNode>(Op);
26666
26667 SDLoc DL(Op);
26668 EVT VT = Store->getValue().getValueType();
26669 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26670
26671 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
26673
26674 return DAG.getMaskedStore(
26675 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
26676 Mask, Store->getMemoryVT(), Store->getMemOperand(),
26677 Store->getAddressingMode(), Store->isTruncatingStore());
26678}
26679
26680SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
26681 SDValue Op, SelectionDAG &DAG) const {
26682 SDLoc dl(Op);
26683 EVT VT = Op.getValueType();
26684 EVT EltVT = VT.getVectorElementType();
26685
26686 bool Signed = Op.getOpcode() == ISD::SDIV;
26687 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
26688
26689 bool Negated;
26690 uint64_t SplatVal;
26691 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
26692 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26693 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
26694 SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32);
26695
26696 SDValue Pg = getPredicateForFixedLengthVector(DAG, dl, VT);
26697 SDValue Res =
26698 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, ContainerVT, Pg, Op1, Op2);
26699 if (Negated)
26700 Res = DAG.getNode(ISD::SUB, dl, ContainerVT,
26701 DAG.getConstant(0, dl, ContainerVT), Res);
26702
26703 return convertFromScalableVector(DAG, VT, Res);
26704 }
26705
26706 // Scalable vector i32/i64 DIV is supported.
26707 if (EltVT == MVT::i32 || EltVT == MVT::i64)
26708 return LowerToPredicatedOp(Op, DAG, PredOpcode);
26709
26710 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
26711 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
26712 EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
26713 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
26714
26715 // If the wider type is legal: extend, op, and truncate.
26716 EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
26717 if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
26718 SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(0));
26719 SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(1));
26720 SDValue Div = DAG.getNode(Op.getOpcode(), dl, WideVT, Op0, Op1);
26721 return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
26722 }
26723
26724 auto HalveAndExtendVector = [&DAG, &dl, &HalfVT, &PromVT,
26725 &ExtendOpcode](SDValue Op) {
26726 SDValue IdxZero = DAG.getConstant(0, dl, MVT::i64);
26727 SDValue IdxHalf =
26728 DAG.getConstant(HalfVT.getVectorNumElements(), dl, MVT::i64);
26729 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxZero);
26730 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxHalf);
26731 return std::pair<SDValue, SDValue>(
26732 {DAG.getNode(ExtendOpcode, dl, PromVT, Lo),
26733 DAG.getNode(ExtendOpcode, dl, PromVT, Hi)});
26734 };
26735
26736 // If the wider type is not legal: split, extend, op, trunc and concat.
26737 auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
26738 auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
26739 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0LoExt, Op1LoExt);
26740 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0HiExt, Op1HiExt);
26741 SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Lo);
26742 SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Hi);
26743 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, {LoTrunc, HiTrunc});
26744}
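// Sketch: SVE has no i8/i16 element divides, so e.g. "sdiv <8 x i16>" is
// sign-extended to i32 elements, divided with the predicated SDIV, and
// truncated back; if the widened type is not legal the operands are first
// split in half as above.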
26745
26746SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
26747 SDValue Op, SelectionDAG &DAG) const {
26748 EVT VT = Op.getValueType();
26749 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26750
26751 SDLoc DL(Op);
26752 SDValue Val = Op.getOperand(0);
26753 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
26754 Val = convertToScalableVector(DAG, ContainerVT, Val);
26755
26756 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
26757 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
26758
26759 // Repeatedly unpack Val until the result is of the desired element type.
26760 switch (ContainerVT.getSimpleVT().SimpleTy) {
26761 default:
26762 llvm_unreachable("unimplemented container type");
26763 case MVT::nxv16i8:
26764 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
26765 if (VT.getVectorElementType() == MVT::i16)
26766 break;
26767 [[fallthrough]];
26768 case MVT::nxv8i16:
26769 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
26770 if (VT.getVectorElementType() == MVT::i32)
26771 break;
26772 [[fallthrough]];
26773 case MVT::nxv4i32:
26774 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
26775 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
26776 break;
26777 }
26778
26779 return convertFromScalableVector(DAG, VT, Val);
26780}
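// For example, a sign_extend from v4i16 to v4i32 is a single SUNPKLO on the
// nxv8i16 container, while v2i16 -> v2i64 takes two unpacks
// (nxv8i16 -> nxv4i32 -> nxv2i64).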
26781
26782SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
26783 SDValue Op, SelectionDAG &DAG) const {
26784 EVT VT = Op.getValueType();
26785 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26786
26787 SDLoc DL(Op);
26788 SDValue Val = Op.getOperand(0);
26789 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
26790 Val = convertToScalableVector(DAG, ContainerVT, Val);
26791
26792 // Repeatedly truncate Val until the result is of the desired element type.
26793 switch (ContainerVT.getSimpleVT().SimpleTy) {
26794 default:
26795 llvm_unreachable("unimplemented container type");
26796 case MVT::nxv2i64:
26797 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
26798 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
26799 if (VT.getVectorElementType() == MVT::i32)
26800 break;
26801 [[fallthrough]];
26802 case MVT::nxv4i32:
26803 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
26804 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
26805 if (VT.getVectorElementType() == MVT::i16)
26806 break;
26807 [[fallthrough]];
26808 case MVT::nxv8i16:
26809 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
26810 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
26811 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
26812 break;
26813 }
26814
26815 return convertFromScalableVector(DAG, VT, Val);
26816}
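// For example, truncating v4i32 to v4i16 bitcasts the nxv4i32 container to
// nxv8i16 and uses UZP1 to pack the low half of each element; wider
// truncations such as v2i64 -> v2i16 simply repeat the bitcast+UZP1 step.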
26817
26818SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
26819 SDValue Op, SelectionDAG &DAG) const {
26820 EVT VT = Op.getValueType();
26821 EVT InVT = Op.getOperand(0).getValueType();
26822 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
26823
26824 SDLoc DL(Op);
26825 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26826 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
26827
26828 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
26829}
26830
26831SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
26832 SDValue Op, SelectionDAG &DAG) const {
26833 EVT VT = Op.getValueType();
26834 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26835
26836 SDLoc DL(Op);
26837 EVT InVT = Op.getOperand(0).getValueType();
26838 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26839 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
26840
26841 auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
26842 Op.getOperand(1), Op.getOperand(2));
26843
26844 return convertFromScalableVector(DAG, VT, ScalableRes);
26845}
26846
26847// Convert vector operation 'Op' to an equivalent predicated operation whereby
26848// the original operation's type is used to construct a suitable predicate.
26849// NOTE: The results for inactive lanes are undefined.
26850SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
26851 SelectionDAG &DAG,
26852 unsigned NewOp) const {
26853 EVT VT = Op.getValueType();
26854 SDLoc DL(Op);
26855 auto Pg = getPredicateForVector(DAG, DL, VT);
26856
26857 if (VT.isFixedLengthVector()) {
26858 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
26859 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26860
26861 // Create list of operands by converting existing ones to scalable types.
26863 for (const SDValue &V : Op->op_values()) {
26864 if (isa<CondCodeSDNode>(V)) {
26865 Operands.push_back(V);
26866 continue;
26867 }
26868
26869 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
26870 EVT VTArg = VTNode->getVT().getVectorElementType();
26871 EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
26872 Operands.push_back(DAG.getValueType(NewVTArg));
26873 continue;
26874 }
26875
26876 assert(isTypeLegal(V.getValueType()) &&
26877 "Expected only legal fixed-width types");
26878 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
26879 }
26880
26881 if (isMergePassthruOpcode(NewOp))
26882 Operands.push_back(DAG.getUNDEF(ContainerVT));
26883
26884 auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
26885 return convertFromScalableVector(DAG, VT, ScalableRes);
26886 }
26887
26888 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
26889
26891 for (const SDValue &V : Op->op_values()) {
26892 assert((!V.getValueType().isVector() ||
26893 V.getValueType().isScalableVector()) &&
26894 "Only scalable vectors are supported!");
26895 Operands.push_back(V);
26896 }
26897
26898 if (isMergePassthruOpcode(NewOp))
26899 Operands.push_back(DAG.getUNDEF(VT));
26900
26901 return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
26902}
26903
26904// If a fixed length vector operation has no side effects when applied to
26905// undefined elements, we can safely use scalable vectors to perform the same
26906// operation without needing to worry about predication.
26907SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
26908 SelectionDAG &DAG) const {
26909 EVT VT = Op.getValueType();
26911 "Only expected to lower fixed length vector operation!");
26912 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26913
26914 // Create list of operands by converting existing ones to scalable types.
26916 for (const SDValue &V : Op->op_values()) {
26917 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
26918
26919 // Pass through non-vector operands.
26920 if (!V.getValueType().isVector()) {
26921 Ops.push_back(V);
26922 continue;
26923 }
26924
26925 // "cast" fixed length vector to a scalable vector.
26926 assert(V.getValueType().isFixedLengthVector() &&
26927 isTypeLegal(V.getValueType()) &&
26928 "Only fixed length vectors are supported!");
26929 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
26930 }
26931
26932 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
26933 return convertFromScalableVector(DAG, VT, ScalableRes);
26934}
26935
26936SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
26937 SelectionDAG &DAG) const {
26938 SDLoc DL(ScalarOp);
26939 SDValue AccOp = ScalarOp.getOperand(0);
26940 SDValue VecOp = ScalarOp.getOperand(1);
26941 EVT SrcVT = VecOp.getValueType();
26942 EVT ResVT = SrcVT.getVectorElementType();
26943
26944 EVT ContainerVT = SrcVT;
26945 if (SrcVT.isFixedLengthVector()) {
26946 ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
26947 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
26948 }
26949
26950 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
26951 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
26952
26953 // Convert operands to Scalable.
26954 AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
26955 DAG.getUNDEF(ContainerVT), AccOp, Zero);
26956
26957 // Perform reduction.
26958 SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
26959 Pg, AccOp, VecOp);
26960
26961 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
26962}
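// Sketch: a strictly-ordered llvm.vector.reduce.fadd(float %acc, <4 x float>)
// turns into FADDA under a VL4 predicate; the accumulator is inserted into
// lane 0 of the scalable register and the final scalar is read back out of
// lane 0 of the result.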
26963
26964SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
26965 SelectionDAG &DAG) const {
26966 SDLoc DL(ReduceOp);
26967 SDValue Op = ReduceOp.getOperand(0);
26968 EVT OpVT = Op.getValueType();
26969 EVT VT = ReduceOp.getValueType();
26970
26971 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
26972 return SDValue();
26973
26974 SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
26975
26976 switch (ReduceOp.getOpcode()) {
26977 default:
26978 return SDValue();
26979 case ISD::VECREDUCE_OR:
26980 if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
26981 // The predicate can be 'Op' because
26982 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
26983 return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
26984 else
26985 return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
26986 case ISD::VECREDUCE_AND: {
26987 Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
26988 return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
26989 }
26990 case ISD::VECREDUCE_XOR: {
26991 SDValue ID =
26992 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
26993 if (OpVT == MVT::nxv1i1) {
26994 // Emulate a CNTP on .Q using .D and a different governing predicate.
26995 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
26996 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
26997 }
26998 SDValue Cntp =
26999 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
27000 return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
27001 }
27002 }
27003
27004 return SDValue();
27005}
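// For example, a vecreduce_or over an all-active nxv16i1 predicate becomes a
// single PTEST with the ANY_ACTIVE condition, and vecreduce_xor is computed
// as the low bit of CNTP, i.e. the parity of the number of active lanes.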
27006
27007SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
27008 SDValue ScalarOp,
27009 SelectionDAG &DAG) const {
27010 SDLoc DL(ScalarOp);
27011 SDValue VecOp = ScalarOp.getOperand(0);
27012 EVT SrcVT = VecOp.getValueType();
27013
27015 SrcVT,
27016 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
27017 EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
27018 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
27019 }
27020
27021 // UADDV always returns an i64 result.
27022 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
27023 SrcVT.getVectorElementType();
27024 EVT RdxVT = SrcVT;
27025 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
27026 RdxVT = getPackedSVEVectorVT(ResVT);
27027
27028 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
27029 SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
27030 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
27031 Rdx, DAG.getConstant(0, DL, MVT::i64));
27032
27033 // The VEC_REDUCE nodes expect an element size result.
27034 if (ResVT != ScalarOp.getValueType())
27035 Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
27036
27037 return Res;
27038}
27039
27040SDValue
27041AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
27042 SelectionDAG &DAG) const {
27043 EVT VT = Op.getValueType();
27044 SDLoc DL(Op);
27045
27046 EVT InVT = Op.getOperand(1).getValueType();
27047 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
27048 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
27049 SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
27050
27051 // Convert the mask to a predicate (NOTE: We don't need to worry about
27052 // inactive lanes since VSELECT is safe when given undefined elements).
27053 EVT MaskVT = Op.getOperand(0).getValueType();
27054 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
27055 auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
27057 MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
27058
27059 auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
27060 Mask, Op1, Op2);
27061
27062 return convertFromScalableVector(DAG, VT, ScalableRes);
27063}
27064
27065SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
27066 SDValue Op, SelectionDAG &DAG) const {
27067 SDLoc DL(Op);
27068 EVT InVT = Op.getOperand(0).getValueType();
27069 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
27070
27071 assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
27072 "Only expected to lower fixed length vector operation!");
27073 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
27074 "Expected integer result of the same bit length as the inputs!");
27075
27076 auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
27077 auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
27078 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
27079
27080 EVT CmpVT = Pg.getValueType();
27081 auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
27082 {Pg, Op1, Op2, Op.getOperand(2)});
27083
27084 EVT PromoteVT = ContainerVT.changeTypeToInteger();
27085 auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
27086 return convertFromScalableVector(DAG, Op.getValueType(), Promote);
27087}
27088
27089SDValue
27090AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
27091 SelectionDAG &DAG) const {
27092 SDLoc DL(Op);
27093 auto SrcOp = Op.getOperand(0);
27094 EVT VT = Op.getValueType();
27095 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27096 EVT ContainerSrcVT =
27097 getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
27098
27099 SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
27100 Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
27101 return convertFromScalableVector(DAG, VT, Op);
27102}
27103
27104SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
27105 SDValue Op, SelectionDAG &DAG) const {
27106 SDLoc DL(Op);
27107 unsigned NumOperands = Op->getNumOperands();
27108
27109 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
27110 "Unexpected number of operands in CONCAT_VECTORS");
27111
27112 auto SrcOp1 = Op.getOperand(0);
27113 auto SrcOp2 = Op.getOperand(1);
27114 EVT VT = Op.getValueType();
27115 EVT SrcVT = SrcOp1.getValueType();
27116
27117 if (NumOperands > 2) {
27119 EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
27120 for (unsigned I = 0; I < NumOperands; I += 2)
27121 Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
27122 Op->getOperand(I), Op->getOperand(I + 1)));
27123
27124 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
27125 }
27126
27127 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27128
27130 SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
27131 SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
27132
27133 Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
27134
27135 return convertFromScalableVector(DAG, VT, Op);
27136}
27137
27138SDValue
27139AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
27140 SelectionDAG &DAG) const {
27141 EVT VT = Op.getValueType();
27142 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27143
27144 SDLoc DL(Op);
27145 SDValue Val = Op.getOperand(0);
27146 SDValue Pg = getPredicateForVector(DAG, DL, VT);
27147 EVT SrcVT = Val.getValueType();
27148 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27149 EVT ExtendVT = ContainerVT.changeVectorElementType(
27150 SrcVT.getVectorElementType());
27151
27152 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
27153 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
27154
27155 Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
27156 Val = getSVESafeBitCast(ExtendVT, Val, DAG);
27157 Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
27158 Pg, Val, DAG.getUNDEF(ContainerVT));
27159
27160 return convertFromScalableVector(DAG, VT, Val);
27161}
27162
27163SDValue
27164AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
27165 SelectionDAG &DAG) const {
27166 EVT VT = Op.getValueType();
27167 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27168
27169 SDLoc DL(Op);
27170 SDValue Val = Op.getOperand(0);
27171 EVT SrcVT = Val.getValueType();
27172 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27173 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
27175 SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
27176
27177 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27178 Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
27179 Op.getOperand(1), DAG.getUNDEF(RoundVT));
27180 Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
27181 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
27182
27183 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
27184 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
27185}
27186
27187SDValue
27188AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
27189 SelectionDAG &DAG) const {
27190 EVT VT = Op.getValueType();
27191 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27192
27193 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
27194 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
27196
27197 SDLoc DL(Op);
27198 SDValue Val = Op.getOperand(0);
27199 EVT SrcVT = Val.getValueType();
27200 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27201 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27202
27203 if (VT.bitsGE(SrcVT)) {
27205
27206 Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
27207 VT.changeTypeToInteger(), Val);
27208
27209 // Safe to use a larger than specified operand because promoting the
27210 // value changes nothing from an arithmetic point of view.
27211 Val =
27212 convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
27213 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
27214 DAG.getUNDEF(ContainerDstVT));
27215 return convertFromScalableVector(DAG, VT, Val);
27216 } else {
27217 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
27218 ContainerDstVT.getVectorElementType());
27220
27221 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27222 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
27223 Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
27224 Val = convertFromScalableVector(DAG, SrcVT, Val);
27225
27226 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
27227 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
27228 }
27229}
27230
27231SDValue
27232AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
27233 SelectionDAG &DAG) const {
27234 SDLoc DL(Op);
27235 EVT OpVT = Op.getValueType();
27236 assert(OpVT.isScalableVector() &&
27237 "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
27238 SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
27239 Op.getOperand(1));
27240 SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
27241 Op.getOperand(1));
27242 return DAG.getMergeValues({Even, Odd}, DL);
27243}
27244
27245SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
27246 SelectionDAG &DAG) const {
27247 SDLoc DL(Op);
27248 EVT OpVT = Op.getValueType();
27249 assert(OpVT.isScalableVector() &&
27250 "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
27251
27252 SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
27253 Op.getOperand(1));
27254 SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0),
27255 Op.getOperand(1));
27256 return DAG.getMergeValues({Lo, Hi}, DL);
27257}
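// For example, interleaving two nxv4i32 operands yields ZIP1 for the low
// half (a0,b0,a1,b1,...) and ZIP2 for the high half; the deinterleave above
// is the inverse operation built from UZP1/UZP2.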
27258
27259SDValue
27260AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
27261 SelectionDAG &DAG) const {
27262 EVT VT = Op.getValueType();
27263 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27264
27265 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
27266 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
27268
27269 SDLoc DL(Op);
27270 SDValue Val = Op.getOperand(0);
27271 EVT SrcVT = Val.getValueType();
27272 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27273 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27274
27275 if (VT.bitsGT(SrcVT)) {
27276 EVT CvtVT = ContainerDstVT.changeVectorElementType(
27277 ContainerSrcVT.getVectorElementType());
27279
27280 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
27281 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
27282
27283 Val = convertToScalableVector(DAG, ContainerDstVT, Val);
27284 Val = getSVESafeBitCast(CvtVT, Val, DAG);
27285 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
27286 DAG.getUNDEF(ContainerDstVT));
27287 return convertFromScalableVector(DAG, VT, Val);
27288 } else {
27289 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
27291
27292 // Safe to use a larger than specified result since an fp_to_int where the
27293 // result doesn't fit into the destination is undefined.
27294 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27295 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
27296 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
27297
27298 return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
27299 }
27300}
27301
27303 ArrayRef<int> ShuffleMask, EVT VT,
27304 EVT ContainerVT, SelectionDAG &DAG) {
27305 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
27306 SDLoc DL(Op);
27307 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
27308 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
27309 bool IsSingleOp =
27310 ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());
27311
27312 if (!Subtarget.isNeonAvailable() && !MinSVESize)
27313 MinSVESize = 128;
27314
27315 // Bail out on two-operand shuffles if SVE2 is unavailable, or if not all
27316 // index values can be represented.
27317 if (!IsSingleOp && !Subtarget.hasSVE2())
27318 return SDValue();
27319
27320 EVT VTOp1 = Op.getOperand(0).getValueType();
27321 unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
27322 unsigned IndexLen = MinSVESize / BitsPerElt;
27323 unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
27324 uint64_t MaxOffset = APInt(BitsPerElt, -1, false).getZExtValue();
27325 EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
27326 EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
27327 bool MinMaxEqual = (MinSVESize == MaxSVESize);
27328 assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
27329 "Incorrectly legalised shuffle operation");
27330
27332 // If MinSVESize is not equal to MaxSVESize then we need to know which
27333 // TBL mask element needs adjustment.
27334 SmallVector<SDValue, 8> AddRuntimeVLMask;
27335
27336 // Bail out for 8-bit element types, because with a 2048-bit SVE register
27337 // size 8 bits are only sufficient to index into the first source vector.
27338 if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
27339 return SDValue();
27340
27341 for (int Index : ShuffleMask) {
27342 // Handle poison index values.
27343 if (Index < 0)
27344 Index = 0;
27345 // If the mask refers to elements in the second operand, then we have to
27346 // offset the index by the number of elements in a vector. If this number
27347 // is not known at compile time, we need to maintain a mask with 'VL' values
27348 // to add at runtime.
27349 if ((unsigned)Index >= ElementsPerVectorReg) {
27350 if (MinMaxEqual) {
27351 Index += IndexLen - ElementsPerVectorReg;
27352 } else {
27353 Index = Index - ElementsPerVectorReg;
27354 AddRuntimeVLMask.push_back(DAG.getConstant(1, DL, MVT::i64));
27355 }
27356 } else if (!MinMaxEqual)
27357 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
27358 // For 8-bit elements with 1024-bit SVE registers, where MaxOffset equals
27359 // 255, this might point to the last element in the second operand
27360 // of the shufflevector, so we reject this transform.
27361 if ((unsigned)Index >= MaxOffset)
27362 return SDValue();
27363 TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
27364 }
27365
27366 // Choosing an out-of-range index leads to the lane being zeroed, rather
27367 // than performing first-lane duplication for out-of-range elements. For i8
27368 // elements an out-of-range index can still be valid for a 2048-bit vector
27369 // register size.
27370 for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
27371 TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
27372 if (!MinMaxEqual)
27373 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
27374 }
27375
27376 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
27377 SDValue VecMask =
27378 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
27379 SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
27380
27381 SDValue Shuffle;
27382 if (IsSingleOp)
27383 Shuffle =
27384 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
27385 DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
27386 Op1, SVEMask);
27387 else if (Subtarget.hasSVE2()) {
27388 if (!MinMaxEqual) {
27389 unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
27390 SDValue VScale = (BitsPerElt == 64)
27391 ? DAG.getVScale(DL, MVT::i64, APInt(64, MinNumElts))
27392 : DAG.getVScale(DL, MVT::i32, APInt(32, MinNumElts));
27393 SDValue VecMask =
27394 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
27395 SDValue MulByMask = DAG.getNode(
27396 ISD::MUL, DL, MaskType,
27397 DAG.getNode(ISD::SPLAT_VECTOR, DL, MaskType, VScale),
27398 DAG.getBuildVector(MaskType, DL,
27399 ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
27400 SDValue UpdatedVecMask =
27401 DAG.getNode(ISD::ADD, DL, MaskType, VecMask, MulByMask);
27402 SVEMask = convertToScalableVector(
27403 DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask);
27404 }
27405 Shuffle =
27406 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
27407 DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
27408 Op1, Op2, SVEMask);
27409 }
27410 Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
27411 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
27412}
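// Sketch, assuming a two-operand v4i16 shuffle with a known 128-bit SVE
// register size: IndexLen is 8, so a mask index of 5 (element 1 of the second
// source) is rewritten to 9 to address that lane in the concatenated TBL2
// table, and the unused trailing mask slots are filled with an out-of-range
// index so those lanes read as zero.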
27413
27414SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
27415 SDValue Op, SelectionDAG &DAG) const {
27416 EVT VT = Op.getValueType();
27417 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27418
27419 auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
27420 auto ShuffleMask = SVN->getMask();
27421
27422 SDLoc DL(Op);
27423 SDValue Op1 = Op.getOperand(0);
27424 SDValue Op2 = Op.getOperand(1);
27425
27426 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27427 Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
27428 Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
27429
27430 auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
27431 if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
27432 return MVT::i32;
27433 return ScalarTy;
27434 };
27435
27436 if (SVN->isSplat()) {
27437 unsigned Lane = std::max(0, SVN->getSplatIndex());
27438 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
27439 SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
27440 DAG.getConstant(Lane, DL, MVT::i64));
27441 Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
27442 return convertFromScalableVector(DAG, VT, Op);
27443 }
27444
27445 bool ReverseEXT = false;
27446 unsigned Imm;
27447 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
27448 Imm == VT.getVectorNumElements() - 1) {
27449 if (ReverseEXT)
27450 std::swap(Op1, Op2);
27451 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
27452 SDValue Scalar = DAG.getNode(
27453 ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
27454 DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
27455 Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
27456 return convertFromScalableVector(DAG, VT, Op);
27457 }
27458
27459 for (unsigned LaneSize : {64U, 32U, 16U}) {
27460 if (isREVMask(ShuffleMask, VT, LaneSize)) {
27461 EVT NewVT =
27463 unsigned RevOp;
27464 unsigned EltSz = VT.getScalarSizeInBits();
27465 if (EltSz == 8)
27467 else if (EltSz == 16)
27469 else
27471
27472 Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
27473 Op = LowerToPredicatedOp(Op, DAG, RevOp);
27474 Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
27475 return convertFromScalableVector(DAG, VT, Op);
27476 }
27477 }
27478
27479 if (Subtarget->hasSVE2p1() && VT.getScalarSizeInBits() == 64 &&
27480 isREVMask(ShuffleMask, VT, 128)) {
27481 if (!VT.isFloatingPoint())
27482 return LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
27483
27485 Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
27486 Op = LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
27487 Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
27488 return convertFromScalableVector(DAG, VT, Op);
27489 }
27490
27491 unsigned WhichResult;
27492 if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
27494 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
27495
27496 if (isTRNMask(ShuffleMask, VT, WhichResult)) {
27497 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
27499 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
27500 }
27501
27502 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
27504 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
27505
27506 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
27507 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
27509 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
27510 }
27511
27512 // Functions like isZIPMask return true when an ISD::VECTOR_SHUFFLE's mask
27513 // represents the same logical operation as performed by a ZIP instruction. In
27514 // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
27515 // equivalent to an AArch64 instruction. There's the extra component of
27516 // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
27517 // only operated on 64/128bit vector types that have a direct mapping to a
27518 // target register and so an exact mapping is implied.
27519 // However, when using SVE for fixed length vectors, most legal vector types
27520 // are actually sub-vectors of a larger SVE register. When mapping
27521 // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
27522 // how the mask's indices translate. Specifically, when the mapping requires
27523 // an exact meaning for a specific vector index (e.g. Index X is the last
27524 // vector element in the register) then such mappings are often only safe when
27525 // the exact SVE register size is known. The main exception to this is when
27526 // indices are logically relative to the first element of either
27527 // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
27528 // when converting from fixed-length to scalable vector types (i.e. the start
27529 // of a fixed length vector is always the start of a scalable vector).
27530 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
27531 unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
27532 if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
27533 if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
27534 Op2.isUndef()) {
27535 Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
27536 return convertFromScalableVector(DAG, VT, Op);
27537 }
27538
27539 if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
27541 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
27542
27543 if (isUZPMask(ShuffleMask, VT, WhichResult)) {
27544 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
27546 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
27547 }
27548
27549 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
27551 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
27552
27553 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
27554 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
27556 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
27557 }
27558 }
27559
27560 // Avoid producing a TBL instruction if we don't know the minimal SVE register
27561 // size, unless NEON is not available and we can assume the minimal SVE
27562 // register size is 128 bits.
27563 if (MinSVESize || !Subtarget->isNeonAvailable())
27564 return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
27565 DAG);
27566
27567 return SDValue();
27568}
27569
27570SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
27571 SelectionDAG &DAG) const {
27572 SDLoc DL(Op);
27573 EVT InVT = Op.getValueType();
27574
27575 assert(VT.isScalableVector() && isTypeLegal(VT) &&
27576 InVT.isScalableVector() && isTypeLegal(InVT) &&
27577 "Only expect to cast between legal scalable vector types!");
27578 assert(VT.getVectorElementType() != MVT::i1 &&
27579 InVT.getVectorElementType() != MVT::i1 &&
27580 "For predicate bitcasts, use getSVEPredicateBitCast");
27581
27582 if (InVT == VT)
27583 return Op;
27584
27586 EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
27587
27588 // Safe bitcasting between unpacked vector types of different element counts
27589 // is currently unsupported because the following is missing the necessary
27590 // work to ensure the result's elements live where they're supposed to within
27591 // an SVE register.
27592 // 01234567
27593 // e.g. nxv2i32 = XX??XX??
27594 // nxv4f16 = X?X?X?X?
27596 VT == PackedVT || InVT == PackedInVT) &&
27597 "Unexpected bitcast!");
27598
27599 // Pack input if required.
27600 if (InVT != PackedInVT)
27601 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
27602
27603 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
27604
27605 // Unpack result if required.
27606 if (VT != PackedVT)
27608
27609 return Op;
27610}
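// For example, casting nxv2f32 to nxv2i64 first reinterprets the unpacked
// nxv2f32 value into its packed nxv4f32 form, then bitcasts that to nxv2i64;
// in the opposite direction the result is reinterpreted back to the unpacked
// layout after the bitcast.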
27611
27613 SDValue N) const {
27614 return ::isAllActivePredicate(DAG, N);
27615}
27616
27618 return ::getPromotedVTForPredicate(VT);
27619}
27620
27621bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
27622 SDValue Op, const APInt &OriginalDemandedBits,
27623 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
27624 unsigned Depth) const {
27625
27626 unsigned Opc = Op.getOpcode();
27627 switch (Opc) {
27628 case AArch64ISD::VSHL: {
27629 // Match (VSHL (VLSHR Val X) X)
27630 SDValue ShiftL = Op;
27631 SDValue ShiftR = Op->getOperand(0);
27632 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
27633 return false;
27634
27635 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
27636 return false;
27637
27638 unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
27639 unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
27640
27641 // Other cases can be handled as well, but this is not
27642 // implemented.
27643 if (ShiftRBits != ShiftLBits)
27644 return false;
27645
27646 unsigned ScalarSize = Op.getScalarValueSizeInBits();
27647 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
27648
27649 APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
27650 APInt UnusedBits = ~OriginalDemandedBits;
27651
27652 if ((ZeroBits & UnusedBits) != ZeroBits)
27653 return false;
27654
27655 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
27656 // used - simplify to just Val.
27657 return TLO.CombineTo(Op, ShiftR->getOperand(0));
27658 }
27659 case AArch64ISD::BICi: {
27660 // Fold BICi if all destination bits are already known to be zeroed.
27661 SDValue Op0 = Op.getOperand(0);
27662 KnownBits KnownOp0 =
27663 TLO.DAG.computeKnownBits(Op0, OriginalDemandedElts, Depth + 1);
27664 // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2))
27665 uint64_t BitsToClear = Op->getConstantOperandVal(1)
27666 << Op->getConstantOperandVal(2);
27667 APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
27668 if (APInt(Known.getBitWidth(), BitsToClear)
27669 .isSubsetOf(AlreadyZeroedBitsToClear))
27670 return TLO.CombineTo(Op, Op0);
27671
27672 Known = KnownOp0 &
27673 KnownBits::makeConstant(APInt(Known.getBitWidth(), ~BitsToClear));
27674
27675 return false;
27676 }
27677 case ISD::INTRINSIC_WO_CHAIN: {
27678 if (auto ElementSize = IsSVECntIntrinsic(Op)) {
27679 unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
27680 if (!MaxSVEVectorSizeInBits)
27681 MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
27682 unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
27683 // The SVE count intrinsics don't support the multiplier immediate so we
27684 // don't have to account for that here. The value returned may be slightly
27685 // over the true required bits, as this is based on the "ALL" pattern. The
27686 // other patterns are also exposed by these intrinsics, but they all
27687 // return a value that's strictly less than "ALL".
27688 unsigned RequiredBits = llvm::bit_width(MaxElements);
27689 unsigned BitWidth = Known.Zero.getBitWidth();
27690 if (RequiredBits < BitWidth)
27691 Known.Zero.setHighBits(BitWidth - RequiredBits);
27692 return false;
27693 }
27694 }
27695 }
27696
27697 return TargetLowering::SimplifyDemandedBitsForTargetNode(
27698 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
27699}
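The BICi case above drops the instruction when every bit it would clear is already known to be zero in its first operand. A minimal scalar model of that check (illustrative only; not part of this file):

// --- Illustrative example, not part of AArch64ISelLowering.cpp ---
// BitsToClear is the shifted immediate mask; the BIC is redundant when that
// mask is a subset of the operand's known-zero bits.
#include <cassert>
#include <cstdint>

static bool bicIsRedundant(uint64_t KnownZero, uint64_t Imm, uint64_t Shift) {
  uint64_t BitsToClear = Imm << Shift;
  return (BitsToClear & ~KnownZero) == 0; // every cleared bit already zero?
}

int main() {
  assert(bicIsRedundant(/*KnownZero=*/0x00FF, /*Imm=*/0xFF, /*Shift=*/0));  // fold away
  assert(!bicIsRedundant(/*KnownZero=*/0x00FF, /*Imm=*/0xFF, /*Shift=*/8)); // keep BIC
}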
27700
27701bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
27702 return Op.getOpcode() == AArch64ISD::DUP ||
27703 Op.getOpcode() == AArch64ISD::MOVI ||
27704 (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
27705 Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
27706 TargetLowering::isTargetCanonicalConstantNode(Op);
27707}
27708
27709 bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
27710 return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
27711 Subtarget->hasComplxNum();
27712}
27713
27714 bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
27715 ComplexDeinterleavingOperation Operation, Type *Ty) const {
27716 auto *VTy = dyn_cast<VectorType>(Ty);
27717 if (!VTy)
27718 return false;
27719
27720 // If the vector is scalable, SVE is enabled, implying support for complex
27721 // numbers. Otherwise, we need to ensure complex number support is available.
27722 if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
27723 return false;
27724
27725 auto *ScalarTy = VTy->getScalarType();
27726 unsigned NumElements = VTy->getElementCount().getKnownMinValue();
27727
27728 // We can only process vectors that have a bit size of 128 or higher (or 64
27729 // bits when NEON is used). Additionally, these vectors must have a
27730 // power-of-2 size, as we later split them into the smallest supported size
27731 // and merge them back together after applying the complex operation.
27732 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
27733 if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
27734 !llvm::isPowerOf2_32(VTyWidth))
27735 return false;
27736
27737 if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
27738 unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
27739 return 8 <= ScalarWidth && ScalarWidth <= 64;
27740 }
27741
27742 return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
27743 ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
27744}
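The width restriction used above can be stated on its own: the known-minimum vector width must be a power of two that is at least 128 bits, or exactly 64 bits for a fixed-length (NEON) vector. A standalone restatement of just that rule (illustrative only; the other operand-type checks are ignored):

// --- Illustrative example, not part of AArch64ISelLowering.cpp ---
#include <cassert>

static bool widthSupported(unsigned WidthInBits, bool IsScalable) {
  bool PowerOfTwo = WidthInBits && !(WidthInBits & (WidthInBits - 1));
  if (!PowerOfTwo)
    return false;
  return WidthInBits >= 128 || (!IsScalable && WidthInBits == 64);
}

int main() {
  assert(widthSupported(128, /*IsScalable=*/true));   // minimum SVE width
  assert(widthSupported(64, /*IsScalable=*/false));   // 64-bit NEON vector
  assert(!widthSupported(64, /*IsScalable=*/true));   // too narrow for SVE
  assert(!widthSupported(96, /*IsScalable=*/false));  // not a power of two
}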
27745
27746 Value *AArch64TargetLowering::createComplexDeinterleavingIR(
27747 IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
27748 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
27749 Value *Accumulator) const {
27750 VectorType *Ty = cast<VectorType>(InputA->getType());
27751 bool IsScalable = Ty->isScalableTy();
27752 bool IsInt = Ty->getElementType()->isIntegerTy();
27753
27754 unsigned TyWidth =
27755 Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
27756
27757 assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
27758 "Vector type must be either 64 or a power of 2 that is at least 128");
27759
27760 if (TyWidth > 128) {
27761 int Stride = Ty->getElementCount().getKnownMinValue() / 2;
27762 auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
27763 auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0));
27764 auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0));
27765 auto *UpperSplitA =
27766 B.CreateExtractVector(HalfTy, InputA, B.getInt64(Stride));
27767 auto *UpperSplitB =
27768 B.CreateExtractVector(HalfTy, InputB, B.getInt64(Stride));
27769 Value *LowerSplitAcc = nullptr;
27770 Value *UpperSplitAcc = nullptr;
27771 if (Accumulator) {
27772 LowerSplitAcc = B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(0));
27773 UpperSplitAcc =
27774 B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(Stride));
27775 }
27776 auto *LowerSplitInt = createComplexDeinterleavingIR(
27777 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
27778 auto *UpperSplitInt = createComplexDeinterleavingIR(
27779 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
27780
27781 auto *Result = B.CreateInsertVector(Ty, PoisonValue::get(Ty), LowerSplitInt,
27782 B.getInt64(0));
27783 return B.CreateInsertVector(Ty, Result, UpperSplitInt, B.getInt64(Stride));
27784 }
27785
27786 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
27787 if (Accumulator == nullptr)
27788 Accumulator = Constant::getNullValue(Ty);
27789
27790 if (IsScalable) {
27791 if (IsInt)
27792 return B.CreateIntrinsic(
27793 Intrinsic::aarch64_sve_cmla_x, Ty,
27794 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
27795
27796 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
27797 return B.CreateIntrinsic(
27798 Intrinsic::aarch64_sve_fcmla, Ty,
27799 {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
27800 }
27801
27802 Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
27803 Intrinsic::aarch64_neon_vcmla_rot90,
27804 Intrinsic::aarch64_neon_vcmla_rot180,
27805 Intrinsic::aarch64_neon_vcmla_rot270};
27806
27807
27808 return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
27809 {Accumulator, InputA, InputB});
27810 }
27811
27812 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
27813 if (IsScalable) {
27814 if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
27815 Rotation == ComplexDeinterleavingRotation::Rotation_270) {
27816 if (IsInt)
27817 return B.CreateIntrinsic(
27818 Intrinsic::aarch64_sve_cadd_x, Ty,
27819 {InputA, InputB, B.getInt32((int)Rotation * 90)});
27820
27821 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
27822 return B.CreateIntrinsic(
27823 Intrinsic::aarch64_sve_fcadd, Ty,
27824 {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
27825 }
27826 return nullptr;
27827 }
27828
27829 Intrinsic::ID IntId = Intrinsic::not_intrinsic;
27830 if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
27831 IntId = Intrinsic::aarch64_neon_vcadd_rot90;
27832 else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
27833 IntId = Intrinsic::aarch64_neon_vcadd_rot270;
27834
27835 if (IntId == Intrinsic::not_intrinsic)
27836 return nullptr;
27837
27838 return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
27839 }
27840
27841 return nullptr;
27842}
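For vectors wider than 128 bits the routine above splits the operands at Stride = NumElts / 2, recurses on each half and re-inserts the results. The host-side sketch below is illustrative only (std::vector stands in for IR values); it shows why applying a lane-wise operation to the halves and concatenating gives the same result as applying it to the whole vector, which is the property the extract/insert recursion relies on.

// --- Illustrative example, not part of AArch64ISelLowering.cpp ---
#include <cassert>
#include <vector>

static std::vector<int> laneWiseOp(const std::vector<int> &A,
                                   const std::vector<int> &B) {
  std::vector<int> R(A.size());
  for (size_t I = 0; I < A.size(); ++I)
    R[I] = A[I] + B[I]; // stand-in for one 128-bit-wide complex operation
  return R;
}

static std::vector<int> splitAndRecombine(const std::vector<int> &A,
                                          const std::vector<int> &B) {
  size_t Stride = A.size() / 2; // mirrors the Stride used above
  std::vector<int> LoA(A.begin(), A.begin() + Stride), HiA(A.begin() + Stride, A.end());
  std::vector<int> LoB(B.begin(), B.begin() + Stride), HiB(B.begin() + Stride, B.end());
  std::vector<int> R = laneWiseOp(LoA, LoB), Hi = laneWiseOp(HiA, HiB);
  R.insert(R.end(), Hi.begin(), Hi.end()); // mirrors the InsertVector calls
  return R;
}

int main() {
  std::vector<int> A = {1, 2, 3, 4, 5, 6, 7, 8}, B = {8, 7, 6, 5, 4, 3, 2, 1};
  assert(splitAndRecombine(A, B) == laneWiseOp(A, B));
}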
27843
27844bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
27845 unsigned Opc = N->getOpcode();
27846 if (ISD::isExtOpcode(Opc)) {
27847 if (any_of(N->uses(),
27848 [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
27849 return false;
27850 }
27851 return true;
27852}
27853
27854unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
27855 return Subtarget->getMinimumJumpTableEntries();
27856}
27857
27858 MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
27859 CallingConv::ID CC,
27860 EVT VT) const {
27861 bool NonUnitFixedLengthVector =
27862 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
27863 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
27864 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
27865
27866 EVT VT1;
27867 MVT RegisterVT;
27868 unsigned NumIntermediates;
27869 getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
27870 RegisterVT);
27871 return RegisterVT;
27872}
27873
27874 unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
27875 LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
27876 bool NonUnitFixedLengthVector =
27877 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
27878 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
27879 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
27880
27881 EVT VT1;
27882 MVT VT2;
27883 unsigned NumIntermediates;
27884 return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
27885 NumIntermediates, VT2);
27886}
27887
27888 unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
27889 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
27890 unsigned &NumIntermediates, MVT &RegisterVT) const {
27891 int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
27892 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
27893 if (!RegisterVT.isFixedLengthVector() ||
27894 RegisterVT.getFixedSizeInBits() <= 128)
27895 return NumRegs;
27896
27897 assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
27898 assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
27899 assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
27900
27901 // A size mismatch here implies either type promotion or widening and would
27902 // have resulted in scalarisation if larger vectors had not been available.
27903 if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
27904 EVT EltTy = VT.getVectorElementType();
27905 EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
27906 if (!isTypeLegal(NewVT))
27907 NewVT = EltTy;
27908
27909 IntermediateVT = NewVT;
27910 NumIntermediates = VT.getVectorNumElements();
27911 RegisterVT = getRegisterType(Context, NewVT);
27912 return NumIntermediates;
27913 }
27914
27915 // SVE VLS support does not introduce a new ABI so we should use NEON sized
27916 // types for vector arguments and returns.
27917
27918 unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
27919 NumIntermediates *= NumSubRegs;
27920 NumRegs *= NumSubRegs;
27921
27922 switch (RegisterVT.getVectorElementType().SimpleTy) {
27923 default:
27924 llvm_unreachable("unexpected element type for vector");
27925 case MVT::i8:
27926 IntermediateVT = RegisterVT = MVT::v16i8;
27927 break;
27928 case MVT::i16:
27929 IntermediateVT = RegisterVT = MVT::v8i16;
27930 break;
27931 case MVT::i32:
27932 IntermediateVT = RegisterVT = MVT::v4i32;
27933 break;
27934 case MVT::i64:
27935 IntermediateVT = RegisterVT = MVT::v2i64;
27936 break;
27937 case MVT::f16:
27938 IntermediateVT = RegisterVT = MVT::v8f16;
27939 break;
27940 case MVT::f32:
27941 IntermediateVT = RegisterVT = MVT::v4f32;
27942 break;
27943 case MVT::f64:
27944 IntermediateVT = RegisterVT = MVT::v2f64;
27945 break;
27946 case MVT::bf16:
27947 IntermediateVT = RegisterVT = MVT::v8bf16;
27948 break;
27949 }
27950
27951 return NumRegs;
27952}
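As a concrete illustration of the breakdown above (the numbers are assumed, not taken from this file), a 512-bit fixed-length vector argument that was mapped onto an SVE VLS register is re-counted in 128-bit NEON units, so with i32 elements it is passed as four v4i32 parts:

// --- Illustrative example, not part of AArch64ISelLowering.cpp ---
// NumSubRegs = register bits / 128, and both the intermediate count and the
// register count are scaled by it, as in the code above.
#include <cassert>

int main() {
  unsigned RegisterBits = 512;   // SVE VLS container chosen by type legalisation
  unsigned NumRegs = 1;          // one wide register before the split
  unsigned NumIntermediates = 1;
  unsigned NumSubRegs = RegisterBits / 128;
  NumIntermediates *= NumSubRegs;
  NumRegs *= NumSubRegs;
  assert(NumSubRegs == 4 && NumRegs == 4 && NumIntermediates == 4); // four v4i32 parts
}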
27953
27954 bool AArch64TargetLowering::hasInlineStackProbe(
27955 const MachineFunction &MF) const {
27956 return !Subtarget->isTargetWindows() &&
27957 MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
27958}
27959
27960#ifndef NDEBUG
27961 void AArch64TargetLowering::verifyTargetSDNode(const SDNode *N) const {
27962 switch (N->getOpcode()) {
27963 default:
27964 break;
27965 case AArch64ISD::SUNPKLO:
27966 case AArch64ISD::SUNPKHI:
27967 case AArch64ISD::UUNPKLO:
27968 case AArch64ISD::UUNPKHI: {
27969 assert(N->getNumValues() == 1 && "Expected one result!");
27970 assert(N->getNumOperands() == 1 && "Expected one operand!");
27971 EVT VT = N->getValueType(0);
27972 EVT OpVT = N->getOperand(0).getValueType();
27973 assert(OpVT.isVector() && VT.isVector() && OpVT.isInteger() &&
27974 VT.isInteger() && "Expected integer vectors!");
27975 assert(OpVT.getSizeInBits() == VT.getSizeInBits() &&
27976 "Expected vectors of equal size!");
27977 // TODO: Enable assert once bogus creations have been fixed.
27978 // assert(OpVT.getVectorElementCount() == VT.getVectorElementCount()*2 &&
27979 // "Expected result vector with half the lanes of its input!");
27980 break;
27981 }
27982 case AArch64ISD::TRN1:
27983 case AArch64ISD::TRN2:
27984 case AArch64ISD::UZP1:
27985 case AArch64ISD::UZP2:
27986 case AArch64ISD::ZIP1:
27987 case AArch64ISD::ZIP2: {
27988 assert(N->getNumValues() == 1 && "Expected one result!");
27989 assert(N->getNumOperands() == 2 && "Expected two operands!");
27990 EVT VT = N->getValueType(0);
27991 EVT Op0VT = N->getOperand(0).getValueType();
27992 EVT Op1VT = N->getOperand(1).getValueType();
27993 assert(VT.isVector() && Op0VT.isVector() && Op1VT.isVector() &&
27994 "Expected vectors!");
27995 // TODO: Enable assert once bogus creations have been fixed.
27996 // assert(VT == Op0VT && VT == Op1VT && "Expected matching vectors!");
27997 break;
27998 }
27999 }
28000}
28001#endif
unsigned const MachineRegisterInfo * MRI
static MCRegister MatchRegisterName(StringRef Name)
static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc, uint64_t &Imm)
static bool isIntImmediate(const SDNode *N, uint64_t &Imm)
isIntImmediate - This method tests to see if the node is a constant operand.
static void CustomNonLegalBITCASTResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, EVT ExtendVT, EVT CastVT)
static bool isConcatMask(ArrayRef< int > Mask, EVT VT, bool SplitLHS)
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG)
static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode CC, bool NoNans, EVT VT, const SDLoc &dl, SelectionDAG &DAG)
static bool isAddSubSExt(SDValue N, SelectionDAG &DAG)
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue CCOp, AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC, const SDLoc &DL, SelectionDAG &DAG)
can be transformed to: not (and (not (and (setCC (cmp C)) (setCD (cmp D)))) (and (not (setCA (cmp A))...
static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2, bool &Invert)
changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC usable with the vector...
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt)
isVShiftRImm - Check if this is a valid build_vector for the immediate operand of a vector shift righ...
static bool isSingletonEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG)
static SDValue performCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex)
static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue NormalizeBuildVector(SDValue Op, SelectionDAG &DAG)
static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of zeros to a vector store by scalar stores of WZR/XZR.
static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG)
static SDValue performLastTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue GenerateTBL(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performDUPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static std::optional< PredicateConstraint > parsePredicateConstraint(StringRef Constraint)
static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static void analyzeCallOperands(const AArch64TargetLowering &TLI, const AArch64Subtarget *Subtarget, const TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo)
static std::optional< unsigned > IsSVECntIntrinsic(SDValue S)
static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG)
static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo)
Check whether or not Op is a SET_CC operation, either a generic or an AArch64 lowered one.
static bool isLegalArithImmed(uint64_t C)
static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT)
static ScalableVectorType * getSVEContainerIRType(FixedVectorType *VTy)
static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG)
unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend)
static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG)
static SDValue convertFixedMaskToScalableVector(SDValue Mask, SelectionDAG &DAG)
static bool shouldSinkVScale(Value *Op, SmallVectorImpl< Use * > &Ops)
We want to sink following cases: (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A,...
static bool isZeroingInactiveLanes(SDValue Op)
static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG)
static bool isREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
static SDValue tryCombineMULLWithUZP1(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG, bool isSigned)
static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG)
static bool isZerosVector(const SDNode *N)
isZerosVector - Check whether SDNode N is a zero-filled vector.
static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth=0)
static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG)
static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG)
Get rid of unnecessary NVCASTs (that don't change the type).
static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
Fold a floating-point divide by power of two into fixed-point to floating-point conversion.
static const TargetRegisterClass * getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT)
static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG, bool Invert)
static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset, SDLoc DL, unsigned BitWidth)
static bool isPredicateCCSettingOp(SDValue N)
static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG)
static bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType)
static SDValue performSVEAndCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG)
static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, ArrayRef< int > ShuffleMask, EVT VT, EVT ContainerVT, SelectionDAG &DAG)
static SDValue performBRCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static MVT getSVEContainerType(EVT ContentTy)
static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG)
static bool isMergePassthruOpcode(unsigned Opc)
static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG, SDLoc DL, bool &IsMLA)
static SDValue performFADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performNEONPostLDSTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Target-specific DAG combine function for NEON load/store intrinsics to merge base address updates.
static void ReplaceCMP_SWAP_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp, SelectionDAG &DAG)
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget, const AArch64TargetLowering &TLI)
static bool isZeroExtended(SDValue N, SelectionDAG &DAG)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static SDValue performSelectCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with the compare-mask instruct...
static bool isCheapToExtend(const SDValue &N)
static cl::opt< bool > EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, cl::desc("Enable AArch64 logical imm instruction " "optimization"), cl::init(true))
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl< Use * > &Ops)
static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG)
static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes, unsigned ScalarSizeInBytes)
Check if the value of OffsetInBytes can be used as an immediate for the gather load/prefetch and scat...
static bool isUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of "vector_shuffle v,...
static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
#define LCALLNAME4(A, B)
static unsigned getDUPLANEOp(EVT EltType)
static void changeFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
static bool isTRNMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget, const TargetMachine &TM)
static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, EVT VT, EVT MemVT, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT, SelectionDAG &DAG, unsigned &ShiftValue, SDValue &RShOperand)
static bool isExtendOrShiftOperand(SDValue N)
static bool isLanes1toNKnownZero(SDValue Op)
static bool setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL, AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI)
Set the IntrinsicInfo for the aarch64_sve_st<N> intrinsics.
static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG)
static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N, SelectionDAG &DAG)
static EVT getPackedSVEVectorVT(EVT VT)
static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performFlagSettingCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned GenericOpcode)
static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performCSELCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static void ReplaceReductionResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, unsigned InterOp, unsigned AcrossOp)
static bool isEquivalentMaskless(unsigned CC, unsigned width, ISD::LoadExtType ExtType, int AddConstant, int CompConstant)
static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG)
static EVT getExtensionTo64Bits(const EVT &OrigVT)
static bool isCMP(SDValue Op)
static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG)
static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static Function * getStructuredLoadFunction(Module *M, unsigned Factor, bool Scalable, Type *LDVTy, Type *PtrTy)
static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG)
static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc, SelectionDAG &DAG, bool UnpredOp=false, bool SwapOperands=false)
static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad)
static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, AArch64CC::CondCode Predicate)
Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain of CCMP/CFCMP ops.
static SDValue performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated)
static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian)
static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT, unsigned Opcode, SelectionDAG &DAG)
static bool createTblShuffleForZExt(ZExtInst *ZExt, FixedVectorType *DstTy, bool IsLittleEndian)
static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, SelectionDAG &DAG)
static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint)
static bool isINSMask(ArrayRef< int > M, int NumInputElements, bool &DstIsLeft, int &Anomaly)
static const MCPhysReg GPRArgRegs[]
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, APInt &UndefBits)
static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG)
static SDValue performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isPassedInFPR(EVT VT)
static unsigned getIntrinsicID(const SDNode *N)
static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert)
static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG)
static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N, SDValue &BasePtr, SDValue &Index, SelectionDAG &DAG)
static SDValue performANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool canEmitConjunction(const SDValue Val, bool &CanNegate, bool &MustBeFirst, bool WillNegate, unsigned Depth=0)
Returns true if Val is a tree of AND/OR/SETCC operations that can be expressed as a conjunction.
static bool isWideDUPMask(ArrayRef< int > M, EVT VT, unsigned BlockSize, unsigned &DupLaneOp)
Check if a vector shuffle corresponds to a DUP instructions with a larger element width than the vect...
static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static cl::opt< bool > EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden, cl::desc("Combine ext and trunc to TBL"), cl::init(true))
static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, SDValue SplatVal, unsigned NumVecElts)
static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *ST)
static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2, unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &dl)
GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit the specified operations t...
static SDValue performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue removeRedundantInsertVectorElt(SDNode *N)
static std::optional< AArch64CC::CondCode > getCSETCondCode(SDValue Op)
static SDValue optimizeIncrementingWhile(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsEqual)
static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG)
Legalize the gather prefetch (scalar + vector addressing mode) when the offset vector is an unpacked ...
static bool isNegatedInteger(SDValue Op)
static SDValue performFirstTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static bool isLoadOrMultipleLoads(SDValue B, SmallVector< LoadSDNode * > &Loads)
static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc)
static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16)
static Function * getStructuredStoreFunction(Module *M, unsigned Factor, bool Scalable, Type *STVTy, Type *PtrTy)
static SDValue performVectorShiftCombine(SDNode *N, const AArch64TargetLowering &TLI, TargetLowering::DAGCombinerInfo &DCI)
Optimize a vector shift instruction and its operand if shifted out bits are not used.
static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG)
static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, unsigned ScalarSizeInBytes)
Combines a node carrying the intrinsic aarch64_sve_prf<T>_gather_scalar_offset into a node that uses ...
static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of a scalar to a vector store by scalar stores of the scalar value.
unsigned getSignExtendedGatherOpcode(unsigned Opcode)
static bool isOrXorChain(SDValue N, unsigned &Num, SmallVector< std::pair< SDValue, SDValue >, 16 > &WorkList)
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt)
getVShiftImm - Check if this is a valid build_vector for the immediate operand of a vector shift oper...
static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC)
changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 CC
static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd)
static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG)
static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, const APInt &Demanded, TargetLowering::TargetLoweringOpt &TLO, unsigned NewOpc)
static unsigned getCmpOperandFoldingProfit(SDValue Op)
Returns how profitable it is to fold a comparison's operand's shift and/or extension operations.
static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performConcatVectorsCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, SelectionDAG &DAG)
#define MAKE_CASE(V)
static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG)
static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG)
Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern into sext/zext(buildvecto...
static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode)
static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG)
static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG)
static const MCPhysReg FPRArgRegs[]
static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, SelectionDAG &DAG)
Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR, WZR, invert(<cond>)'.
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static void replaceBoolVectorBitcast(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG)
static SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT, int Pattern)
static bool isEXTMask(ArrayRef< int > M, EVT VT, bool &ReverseEXT, unsigned &Imm)
static std::optional< ReducedGprConstraint > parseReducedGprConstraint(StringRef Constraint)
static SDValue tryCombineFixedPointConvert(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue performSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG)
Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup)) making use of the vector SExt/ZE...
static SDValue performAddSubLongCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG)
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
Fold a floating-point multiply by power of two into floating-point to fixed-point conversion.
static EVT calculatePreExtendType(SDValue Extend)
Calculates what the pre-extend type is, based on the extension operation node provided by Extend.
static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG)
static EVT getPromotedVTForPredicate(EVT VT)
static void changeFPCCToANDAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
Convert a DAG fp condition code to an AArch64 CC.
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
Turn vector tests of the signbit in the form of: xor (sra X, elt_size(X)-1), -1 into: cmge X,...
static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG)
static bool isAllConstantBuildVector(const SDValue &PotentialBVec, uint64_t &ConstVal)
static SDValue performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG)
static Value * UseTlsOffset(IRBuilderBase &IRB, unsigned Offset)
static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG)
WidenVector - Given a value in the V64 register class, produce the equivalent value in the V128 regis...
static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG)
static bool isSignExtended(SDValue N, SelectionDAG &DAG)
static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG, const AArch64Subtarget *ST)
static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, AArch64CC::CondCode Cond)
static bool isSetCCOrZExtSetCC(const SDValue &Op, SetCCInfoAndKind &Info)
cl::opt< bool > EnableAArch64ELFLocalDynamicTLSGeneration("aarch64-elf-ldtls-generation", cl::Hidden, cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false))
static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG)
static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG)
static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG)
static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG)
static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG)
static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode, AtomicOrdering Ordering)
static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performPostLD1Combine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, bool IsLaneOp)
Target-specific DAG combine function for post-increment LD1 (lane) and post-increment LD1R.
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
std::pair< SDValue, uint64_t > lookThroughSignExtension(SDValue Val)
bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL)
static SDValue performMSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG)
static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale, SDLoc DL, SelectionDAG &DAG)
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG)
static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG)
static bool isSplatShuffle(Value *V)
bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT)
static SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performAddCombineForShiftedOperands(SDNode *N, SelectionDAG &DAG)
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V)
static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG, unsigned Opcode, bool IsSigned)
static bool isPackedVectorType(EVT VT, SelectionDAG &DAG)
Returns true if VT's elements occupy the lowest bit positions of its associated register class withou...
static bool isTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of "vector_shuffle v,...
static bool isAddSubZExt(SDValue N, SelectionDAG &DAG)
static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt)
isVShiftLImm - Check if this is a valid build_vector for the immediate operand of a vector shift left...
static SDValue performExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performMaskedGatherScatterCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, SelectionDAG &DAG)
static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl, SelectionDAG &DAG, SDValue Chain, bool IsSignaling)
static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performBuildVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64TargetLowering &TLI)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG)
static unsigned getSMCondition(const SMEAttrs &CallerAttrs, const SMEAttrs &CalleeAttrs)
static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static bool isCMN(SDValue Op, ISD::CondCode CC)
static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N)
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, SDValue Operand, SelectionDAG &DAG, int &ExtraSteps)
static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG)
static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG)
Perform the scalar expression combine in the form of: CSEL(c, 1, cc) + b => CSINC(b+c,...
static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static std::optional< uint64_t > getConstantLaneNumOfExtractHalfOperand(SDValue &Op)
static void ReplaceATOMIC_LOAD_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1, SelectionDAG &DAG, unsigned &NumSubLoads)
static SDValue performLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG, StoreSDNode *Store)
static bool isEssentiallyExtractHighSubvector(SDValue N)
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static unsigned getExtFactor(SDValue &V)
getExtFactor - Determine the adjustment factor for the position when generating an "extract from vect...
static cl::opt< unsigned > MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden, cl::desc("Maximum of xors"))
#define LCALLNAME5(A, B)
static SDValue performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
static SDValue performMULLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static const MVT MVT_CC
Value type used for condition codes.
static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performReinterpretCastCombine(SDNode *N)
SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG)
static SDValue performTBZCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC)
Emit expression as a conjunction (a series of CCMP/CFCMP ops).
static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N)
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, const SDLoc &dl)
static bool performTBISimplification(SDValue Addr, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Simplify Addr given that the top byte of it is ignored by HW during address translation.
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat=false)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG)
static bool isAllInactivePredicate(SDValue N)
static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT, SDLoc DL, SelectionDAG &DAG)
static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static cl::opt< bool > EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden, cl::desc("Combine extends of AArch64 masked " "gather intrinsics"), cl::init(true))
static bool isZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of "vector_shuffle v,...
static SDValue performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isWideTypeMask(ArrayRef< int > M, EVT VT, SmallVectorImpl< int > &NewMask)
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z, SelectionDAG &DAG)
static SDValue performANDSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static const TargetRegisterClass * getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT)
static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode, SDNode *AndNode, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex, unsigned CC)
static std::pair< SDValue, SDValue > getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG)
#define FALKOR_STRIDED_ACCESS_MD
@ Generic
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static bool isConstant(const MachineInstr &MI)
static const LLT S1
amdgpu AMDGPU Register Bank Select
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
@ Scaled
@ OP_VEXT3
@ OP_VTRNR
@ OP_VDUP1
@ OP_VZIPR
@ OP_VUZPR
@ OP_VREV
@ OP_VZIPL
@ OP_VTRNL
@ OP_COPY
@ OP_VEXT1
@ OP_VDUP0
@ OP_VEXT2
@ OP_VUZPL
@ OP_VDUP3
@ OP_VDUP2
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy)
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
Symbol * Sym
Definition: ELF_riscv.cpp:479
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static Function * getFunction(Constant *C)
Definition: Evaluator.cpp:236
static bool isSigned(unsigned int Opcode)
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
#define im(i)
Hexagon Common GEP
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
lazy value info
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
mir Rename Register Operands
unsigned const TargetRegisterInfo * TRI
This file provides utility analysis objects describing memory locations.
Module.h This file contains the declarations for the Module class.
LLVMContext & Context
This file defines ARC utility functions which are used by various parts of the compiler.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static CodeModel::Model getCodeModel(const PPCSubtarget &S, const TargetMachine &TM, const MachineOperand &MO)
PowerPC Reduce CR logical Operation
const char LLVMTargetMachineRef TM
static bool getVal(MDTuple *MD, const char *Key, uint64_t &Val)
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallSet class.
This file defines the SmallVector class.
static bool Enabled
Definition: Statistic.cpp:46
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
static const int BlockSize
Definition: TarWriter.cpp:33
This pass exposes codegen information to IR-level passes.
This defines the Use class.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
static constexpr int Concat[]
Value * RHS
Value * LHS
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
void setVarArgsStackOffset(unsigned Offset)
void setTailCallReservedStack(unsigned bytes)
SmallVectorImpl< ForwardedRegister > & getForwardedMustTailRegParms()
void setBytesInStackArgArea(unsigned bytes)
void setHasSwiftAsyncContext(bool HasContext)
void setJumpTableEntryInfo(int Idx, unsigned Size, MCSymbol *PCRelSym)
void setArgumentStackToRestore(unsigned bytes)
void setHasStreamingModeChanges(bool HasChanges)
bool isLegalAddressingMode(unsigned NumBytes, int64_t Offset, unsigned Scale) const
void UpdateCustomCalleeSavedRegs(MachineFunction &MF) const
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
const AArch64RegisterInfo * getRegisterInfo() const override
unsigned getMinimumJumpTableEntries() const
const AArch64InstrInfo * getInstrInfo() const override
const char * getSecurityCheckCookieName() const
unsigned getMaximumJumpTableSize() const
ARMProcFamilyEnum getProcFamily() const
Returns ARM processor family.
unsigned classifyGlobalFunctionReference(const GlobalValue *GV, const TargetMachine &TM) const
Align getPrefLoopAlignment() const
Align getPrefFunctionAlignment() const
unsigned getMaxBytesForLoopAlignment() const
bool supportsAddressTopByteIgnored() const
CPU has TBI (top byte of addresses is ignored during HW address translation) and OS enables it.
const Triple & getTargetTriple() const
bool isCallingConvWin64(CallingConv::ID CC) const
const char * getChkStkName() const
bool useSVEForFixedLengthVectors() const
unsigned ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const
ClassifyGlobalReference - Find the target operand flags that describe how a global value should be re...
bool isSVEAvailable() const
Returns true if the target has SVE and can use the full range of SVE instructions,...
bool isXRegisterReserved(size_t i) const
unsigned getMaxSVEVectorSizeInBits() const
unsigned getMinSVEVectorSizeInBits() const
bool hasCustomCallingConv() const
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
Return true if the given shuffle mask can be codegen'd directly, or if it should be stack expanded.
unsigned getVaListSizeInBits(const DataLayout &DL) const override
Returns the size of the platform's va_list object.
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) const override
Return the prefered common base offset.
bool shouldInsertTrailingFenceForAtomicStore(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert a trailing fence without reducing the ordering f...
bool shouldExpandCttzElements(EVT VT) const override
Return true if the @llvm.experimental.cttz.elts intrinsic should be expanded using generic code in Se...
MachineBasicBlock * EmitTileLoad(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
bool shouldRemoveRedundantExtend(SDValue Op) const override
Return true (the default) if it is profitable to remove a sext_inreg(x) where the sext is redundant,...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC) const
Selects the correct CCAssignFn for a given CallingConvention value.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ISD::SETCC ValueType.
bool optimizeExtendOrTruncateConversion(Instruction *I, Loop *L, const TargetTransformInfo &TTI) const override
Try to optimize extending or truncating conversion instructions (like zext, trunc,...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const
Selects the correct CCAssignFn for a given CallingConvention value.
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool hasInlineStackProbe(const MachineFunction &MF) const override
True if stack clash protection is enabled for this functions.
bool isLegalICmpImmediate(int64_t) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: sub y, (xor x, -1) add (add x, 1), y The variant with two add's is IR...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
MachineBasicBlock * EmitZAInstr(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB, bool HasTile) const
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool isOpSuitableForLSE128(const Instruction *I) const
bool lowerInterleavedLoad(LoadInst *LI, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor) const override
Lower an interleaved load into a ldN intrinsic.
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool shouldSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
bool fallBackToDAGISel(const Instruction &Inst) const override
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
bool isLegalAddScalableImmediate(int64_t) const override
Return true if adding the specified scalable immediate is legal, that is the target has add instructi...
Function * getSSPStackGuardCheck(const Module &M) const override
If the target has a standard stack protection check function that performs validation and error handl...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
Returns true if the target allows unaligned memory accesses of the specified type.
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purpose.
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
For some targets, an LLVM struct type must be broken down into multiple simple types,...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
MachineBasicBlock * EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const override
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const
MachineBasicBlock * EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool useLoadStackGuardNode() const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const override
If the target has a standard location for the unsafe stack pointer, returns the address of that locat...
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return true if the target supports combining a chain of 'and %val, #mask' followed by a compare against zero into a single test-style instruction.
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist instruction in then/else to if.
bool isOpSuitableForRCPC3(const Instruction *I) const
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const override
Return true if it is profitable to reduce a load to a smaller type.
MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const override
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override
Lower an interleaved store into a stN intrinsic.
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
MachineBasicBlock * EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode, bool Op0IsDef) const
MachineBasicBlock * EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
Control the following reassociation of operands: (op (op x, c1), y) -> (op (op x, y),...
void verifyTargetSDNode(const SDNode *N) const override
Check the given SDNode. Aborts if it is invalid.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
MachineBasicBlock * EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *BB) const
LLT getOptimalMemOpLLT(const MemOp &Op, const AttributeList &FuncAttributes) const override
LLT returning variant.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool needsFixedCatchObjects() const override
Used for exception handling on Win64.
bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI, LoadInst *LI) const override
Lower a deinterleave intrinsic to a target specific load intrinsic.
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Value * getIRStackGuard(IRBuilderBase &IRB) const override
If the target has a standard location for the stack protector cookie, returns the address of that loc...
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
bool generateFMAsInMachineCombiner(EVT VT, CodeGenOptLevel OptLevel) const override
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
bool hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const override
Return true if the target supplies and combines to a paired load two loaded values of type LoadedType...
bool isOpSuitableForLDPSTP(const Instruction *I) const
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI)
bool isLegalAddImmediate(int64_t) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool shouldConsiderGEPOffsetSplit() const override
bool isVectorClearMaskLegal(ArrayRef< int > M, EVT VT) const override
Similar to isShuffleMaskLegal.
const MCPhysReg * getScratchRegisters(CallingConv::ID CC) const override
Returns a 0 terminated array of registers that can be safely used as scratch registers.
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) const
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II, StoreInst *SI) const override
Lower an interleave intrinsic to a target specific store intrinsic.
bool enableAggressiveFMAFusion(EVT VT) const override
Enable aggressive FMA fusion on targets that want it.
Value * getSDagStackGuard(const Module &M) const override
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
MachineBasicBlock * EmitDynamicProbedAlloc(MachineInstr &MI, MachineBasicBlock *MBB) const
SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, unsigned Condition, SDValue PStateSM=SDValue()) const
If a change in streaming mode is required on entry to/return from a function call it emits and return...
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override
Return true if the @llvm.get.active.lane.mask intrinsic should be expanded using generic code in Sele...
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON=false) const
bool mergeStoresAfterLegalization(EVT VT) const override
SVE code generation for fixed length vectors does not custom lower BUILD_VECTOR.
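Most of the hooks listed above are consulted by target-independent passes through a generic TargetLowering reference rather than called directly. A minimal sketch, assuming only the isLegalAddImmediate/isLegalICmpImmediate hooks above; the helper name and the policy it encodes are invented for illustration:

#include "llvm/CodeGen/TargetLowering.h"

// Hypothetical helper: ask the target whether both immediates can be encoded
// directly, e.g. before rewriting an add/compare pair.
static bool canEncodeBothImmediates(const llvm::TargetLowering &TLI,
                                    int64_t AddImm, int64_t CmpImm) {
  return TLI.isLegalAddImmediate(AddImm) && TLI.isLegalICmpImmediate(CmpImm);
}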
Class for arbitrary precision integers.
Definition: APInt.h:76
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:212
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:427
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:981
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition: APInt.h:207
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1491
static void sdivrem(const APInt &LHS, const APInt &RHS, APInt &Quotient, APInt &Remainder)
Definition: APInt.cpp:1860
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1370
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1002
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:349
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1439
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:187
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:307
APInt sadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1898
bool sle(const APInt &RHS) const
Signed less or equal comparison.
Definition: APInt.h:1144
APInt uadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1905
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1589
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:197
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1010
unsigned logBase2() const
Definition: APInt.h:1703
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:805
bool isMask(unsigned numBits) const
Definition: APInt.h:466
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:312
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:954
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1235
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:418
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:284
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition: APInt.h:319
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:274
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1215
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:367
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1513
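A minimal sketch of how several of the APInt helpers above compose; the bit widths and values are arbitrary examples, not taken from this file:

#include "llvm/ADT/APInt.h"
#include <cassert>

static void apintExamples() {
  using llvm::APInt;
  // Top 8 bits of a 32-bit value set: 0xFF000000.
  APInt Mask = APInt::getHighBitsSet(/*numBits=*/32, /*hiBitsSet=*/8);
  assert(Mask.countr_zero() == 24 && !Mask.isPowerOf2());
  // Widen without changing the value.
  APInt Wide = Mask.zext(64);
  assert(Wide.getBitWidth() == 64 && Wide.getZExtValue() == 0xFF000000u);
  // -4 is a negated power of two (-(-4) == 4).
  APInt MinusFour(32, uint64_t(-4), /*isSigned=*/true);
  assert(MinusFour.isNegative() && MinusFour.isNegatedPowerOf2());
}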
an instruction to allocate memory on the stack
Definition: Instructions.h:59
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:539
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:748
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:778
@ Or
*p = old | v
Definition: Instructions.h:772
@ And
*p = old & v
Definition: Instructions.h:768
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:776
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:782
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:780
@ Nand
*p = ~(old & v)
Definition: Instructions.h:770
bool isFloatingPointOperation() const
Definition: Instructions.h:922
BinOp getOperation() const
Definition: Instructions.h:845
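A minimal sketch of the kind of AtomicRMWInst query a shouldExpandAtomicRMWInIR-style hook performs; the expansion policy here is invented for illustration and is not AArch64's actual policy:

#include "llvm/IR/Instructions.h"

// Hypothetical policy: expand floating-point and NAND read-modify-write ops.
static bool wantsExpansion(const llvm::AtomicRMWInst &RMW) {
  if (RMW.isFloatingPointOperation())
    return true;
  return RMW.getOperation() == llvm::AtomicRMWInst::Nand;
}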
This is an SDNode representing atomic operations.
bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:93
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:206
A "pseudo-class" with methods for operating on BUILD_VECTORs.
ConstantFPSDNode * getConstantFPSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant FP or null if this is not a constant FP splat.
bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power of 2,...
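A minimal sketch of the isConstantSplat query pattern listed above; the 8-bit threshold is an arbitrary example:

#include "llvm/CodeGen/SelectionDAGNodes.h"

// True if the BUILD_VECTOR splats a constant that fits in 8 bits.
static bool isNarrowConstantSplat(llvm::BuildVectorSDNode *BVN) {
  llvm::APInt SplatValue, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  return BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
                              HasAnyUndefs) &&
         SplatBitSize <= 8;
}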
CCState - This class holds information needed while lowering arguments and return values.
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
bool isMemLoc() const
int64_t getLocMemOffset() const
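A minimal sketch of the usual CCState/CCValAssign walk during return lowering; the helper name is invented, and Fn stands in for the CCAssignFn returned by CCAssignFnForReturn(CallConv). The body only shows how the accessors above are typically consulted:

#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/SelectionDAG.h"

static void walkReturnLocations(
    llvm::SelectionDAG &DAG, llvm::CallingConv::ID CallConv, bool IsVarArg,
    const llvm::SmallVectorImpl<llvm::ISD::OutputArg> &Outs,
    llvm::CCAssignFn *Fn) {
  llvm::SmallVector<llvm::CCValAssign, 16> RVLocs;
  llvm::CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
                       *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, Fn);
  for (const llvm::CCValAssign &VA : RVLocs) {
    if (VA.isRegLoc()) {
      // The value is returned in VA.getLocReg(); VA.getLocInfo() says how it
      // was extended or converted to fit that register.
    } else if (VA.isMemLoc()) {
      // The value lives on the stack at VA.getLocMemOffset().
    }
  }
}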
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1687
unsigned arg_size() const
Definition: InstrTypes.h:1685
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1871
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
bool isZero() const
Return true if the value is positive or negative zero.
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:205
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:145
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1398
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
bool isLittleEndian() const
Layout endianness...
Definition: DataLayout.h:238
bool isBigEndian() const
Definition: DataLayout.h:239
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:874
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:311
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:308
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:319
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition: FastISel.h:66
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:168
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:135
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:685
bool empty() const
Definition: Function.h:809
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:202
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:682
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:264
Constant * getPersonalityFn() const
Get the personality function associated with this function.
Definition: Function.cpp:1909
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:340
arg_iterator arg_end()
Definition: Function.h:827
arg_iterator arg_begin()
Definition: Function.h:818
size_t size() const
Definition: Function.h:808
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:358
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:677
const GlobalValue * getGlobal() const
bool isThreadLocal() const
If the value is "Thread Local", its value isn't shared by the threads.
Definition: GlobalValue.h:263
bool hasExternalWeakLinkage() const
Definition: GlobalValue.h:529
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
Type * getValueType() const
Definition: GlobalValue.h:296
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:94
Value * CreateZExtOrBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2137
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition: IRBuilder.h:1037
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2472
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition: IRBuilder.h:1881
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2523
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1045
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:539
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2170
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements.
Definition: IRBuilder.cpp:1212
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2516
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:466
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2067
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2122
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1437
ConstantInt * getInt8(uint8_t C)
Get a constant 8-bit value.
Definition: IRBuilder.h:476
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2081
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:174
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:491
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2127
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1416
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2021
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2494
LLVMContext & getContext() const
Definition: IRBuilder.h:176
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2117
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2007
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1497
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:569
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2412
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Definition: IRBuilder.h:1866
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:516
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2666
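A minimal sketch composing a few of the IRBuilderBase calls above, in the style of the IR-emitting hooks (load-linked, interleaved accesses) listed earlier; the widen-shift-or pattern is an arbitrary illustration:

#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"

// Zero-extend an i8 value to i32, shift it into position and set the low bit.
static llvm::Value *widenShiftAndSet(llvm::IRBuilderBase &Builder,
                                     llvm::Value *V8) {
  llvm::Type *I32 = Builder.getIntNTy(32);
  llvm::Value *Wide = Builder.CreateZExt(V8, I32, "wide");
  llvm::Value *Shifted =
      Builder.CreateShl(Wide, llvm::ConstantInt::get(I32, 3), "shifted");
  return Builder.CreateOr(Shifted, llvm::ConstantInt::get(I32, 1), "result");
}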
This instruction inserts a single (scalar) element into a VectorType value.
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:83
const BasicBlock * getParent() const
Definition: Instruction.h:152
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:87
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:252
Class to represent integer types.
Definition: DerivedTypes.h:40
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:267
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:100
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
Definition: LowLevelType.h:203
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
Definition: Instructions.h:184
Value * getPointerOperand()
Definition: Instructions.h:280
Type * getPointerOperandType() const
Definition: Instructions.h:283
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
@ INVALID_SIMPLE_VALUE_TYPE
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:585
bool isScalableVT() const
Return true if the type is a scalable type.
static auto all_valuetypes()
SimpleValueType Iteration.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto scalable_vector_valuetypes()
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
static auto fp_fixedlen_vector_valuetypes()
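A minimal sketch of the MVT predicates above, in the style of the legality checks used throughout the lowering code; the particular combination tested is arbitrary:

#include "llvm/CodeGen/ValueTypes.h"

// True for 64-bit fixed-width integer vectors such as v8i8 or v4i16.
static bool isSmallFixedIntVector(llvm::MVT VT) {
  return VT.isVector() && !VT.isScalableVector() && VT.is64BitVector() &&
         VT.getVectorElementType().isInteger();
}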
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
SSPLayoutKind getObjectSSPLayout(int ObjectIdx) const
void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
@ SSPLK_None
Did not trigger a stack protector.
void setFrameAddressIsTaken(bool T)
int getStackProtectorIndex() const
Return the index for the stack protector object.
int CreateSpillStackObject(uint64_t Size, Align Alignment)
Create a new statically sized stack object that represents a spill slot, returning a nonnegative iden...
void setStackID(int ObjectIdx, uint8_t ID)
void setHasTailCall(bool V=true)
bool hasMustTailInVarArgFunc() const
Returns true if the function is variadic and contains a musttail call.
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int CreateVariableSizedObject(Align Alignment, const AllocaInst *Alloca)
Notify the MachineFrameInfo object that a variable sized object has been created.
int getObjectIndexEnd() const
Return one past the maximum frame object index.
bool hasStackProtectorIndex() const
uint8_t getStackID(int ObjectIdx) const
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
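A minimal sketch of the BuildMI/MachineInstrBuilder pattern that the EmitInstrWithCustomInserter-style hooks above rely on; the opcode and operands are placeholders, not a real AArch64 expansion:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/TargetInstrInfo.h"

// Emit "Dst = Opc Src, #0" immediately before the iterator position I.
static void emitPlaceholder(llvm::MachineBasicBlock &MBB,
                            llvm::MachineBasicBlock::iterator I,
                            const llvm::TargetInstrInfo &TII, unsigned Opc,
                            llvm::Register Dst, llvm::Register Src) {
  llvm::DebugLoc DL = MBB.findDebugLoc(I);
  llvm::BuildMI(MBB, I, DL, TII.get(Opc), Dst).addReg(Src).addImm(0);
}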
Representation of each machine instruction.
Definition: MachineInstr.h:69
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
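A minimal sketch of combining the MachineMemOperand flags above, in the spirit of the getTargetMMOFlags hook listed earlier; the condition is invented:

#include "llvm/CodeGen/MachineMemOperand.h"

// Mark a load as non-temporal when some (hypothetical) hint says so.
static llvm::MachineMemOperand::Flags loadFlagsFor(bool IsNonTemporalHint) {
  llvm::MachineMemOperand::Flags F = llvm::MachineMemOperand::MOLoad;
  if (IsNonTemporalHint)
    F |= llvm::MachineMemOperand::MONonTemporal;
  return F;
}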
MachineOperand class - Representation of each machine instruction operand.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
size_type size() const
Definition: MapVector.h:60
This class is used to represent an MGATHER node.
const SDValue & getPassThru() const
ISD::LoadExtType getExtensionType() const
This is a base class used to represent MGATHER and MSCATTER nodes.
const SDValue & getIndex() const
const SDValue & getScale() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
ISD::MemIndexType getIndexType() const
How is Index applied to BasePtr when computing addresses.
This class is used to represent an MLOAD node.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
const SDValue & getMask() const
const SDValue & getPassThru() const
const SDValue & getOffset() const
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This class is used to represent an MSTORE node.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isVolatile() const
Align getOriginalAlign() const
Returns alignment and volatility of the memory access.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
AtomicOrdering getMergedOrdering() const
Return a single atomic ordering that is at least as strong as both the success and failure orderings ...
const SDValue & getChain() const
bool isNonTemporal() const
bool isAtomic() const
Return true if the memory operation ordering is Unordered or higher.
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
bool getRtLibUseGOT() const
Returns true if PLT should be avoided for RTLib calls.
Definition: Module.cpp:692
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:293
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
void dump() const
Definition: Pass.cpp:136
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1827
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
void dump() const
Dump this node, for debugging.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
iterator_range< use_iterator > uses()
size_t use_size() const
Return the number of uses of this node.
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Return true if the node is an ISD::UNDEF node.
void setFlags(SDNodeFlags NewFlags)
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
void dump() const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
void setNode(SDNode *N)
set the SDNode
unsigned getOpcode() const
unsigned getNumOperands() const
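A minimal sketch of the SDValue/SDNode inspection style used by the PerformDAGCombine hook listed earlier; the specific pattern (a single-use zero-extended load) is just an example:

#include "llvm/CodeGen/SelectionDAGNodes.h"

// True for a zext whose only input is a load, with both used exactly once.
static bool isSingleUseZExtOfLoad(llvm::SDValue V) {
  if (V.getOpcode() != llvm::ISD::ZERO_EXTEND || !V.hasOneUse())
    return false;
  llvm::SDValue Src = V.getOperand(0);
  return Src.getOpcode() == llvm::ISD::LOAD && Src.hasOneUse();
}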
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasStreamingInterface() const
bool hasStreamingCompatibleInterface() const
bool hasNonStreamingInterface() const
bool hasStreamingBody() const
bool hasZAState() const
Class to represent scalable SIMD vectors.
Definition: DerivedTypes.h:586
static ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:713
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:225
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:722
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:474
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm, bool ConstantFold=true)
Return a node that represents the runtime scaling 'MulImm * RuntimeVL'.
SDNode * isConstantIntBuildVectorOrConstantInt(SDValue N) const
Test whether the given value is a constant int or similar node.
SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getStepVector(const SDLoc &DL, EVT ResVT, const APInt &StepVal)
Returns a vector of type ResVT whose elements contain the linear sequence <0, Step,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:478
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
Definition: SelectionDAG.h:732
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:828
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:472
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
Definition: SelectionDAG.h:480
bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, bool isTargetGA=false, unsigned TargetFlags=0)
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
Definition: SelectionDAG.h:659
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getRegister(unsigned Reg, EVT VT)
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
SDValue getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT, EVT OpVT)
Convert Op, which must be of integer type, to the integer type VT, by using an extension appropriate ...
SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:473
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:773
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:676
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:768
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:469
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:799
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:845
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
LLVMContext * getContext() const
Definition: SelectionDAG.h:485
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:739
SDValue getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand, SDValue Subreg)
A convenience function for creating TargetInstrInfo::INSERT_SUBREG nodes.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:554
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
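A minimal sketch stringing together a few of the SelectionDAG builders above, in the shape of the lowering helpers in this file; the mask value is arbitrary and the helper name is invented:

#include "llvm/CodeGen/SelectionDAG.h"

// Build "setcc (and Op, 0xFF), 0, eq" with the node-construction helpers.
static llvm::SDValue buildLowByteIsZero(llvm::SelectionDAG &DAG,
                                        const llvm::SDLoc &DL,
                                        llvm::SDValue Op) {
  llvm::EVT VT = Op.getValueType();
  llvm::SDValue Mask = DAG.getConstant(0xFF, DL, VT);
  llvm::SDValue And = DAG.getNode(llvm::ISD::AND, DL, VT, Op, Mask);
  llvm::EVT CCVT = DAG.getTargetLoweringInfo().getSetCCResultType(
      DAG.getDataLayout(), *DAG.getContext(), VT);
  return DAG.getSetCC(DL, CCVT, And, DAG.getConstant(0, DL, VT),
                      llvm::ISD::SETEQ);
}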
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static bool isSingleSourceMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector.
static void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
static bool isSplatMask(const int *Mask, EVT VT)
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
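A minimal sketch of querying a shuffle mask with the static ShuffleVectorInst helpers above; what counts as "simple" here is an arbitrary choice:

#include "llvm/IR/Instructions.h"

// A reverse of a single source vector, e.g. <3, 2, 1, 0> for 4 elements.
static bool isSingleSourceReverse(llvm::ArrayRef<int> Mask, int NumSrcElts) {
  return llvm::ShuffleVectorInst::isSingleSourceMask(Mask, NumSrcElts) &&
         llvm::ShuffleVectorInst::isReverseMask(Mask, NumSrcElts);
}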
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
void push_back(const T &Elt)
Definition: SmallVector.h:426
pointer data()
Return a pointer to the vector's buffer, even if empty().
Definition: SmallVector.h:299
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
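As a brief illustration of the SmallVector/SmallSet entries above (not code from this file), the sketch below keeps the first occurrence of each value; the element type and inline sizes are arbitrary.

// Illustrative sketch only: deduplicate while preserving order using the
// SmallSet::insert() pair-return and SmallVector::push_back() listed above.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"

static llvm::SmallVector<int, 8> keepFirstOccurrences(llvm::ArrayRef<int> In) {
  llvm::SmallSet<int, 8> Seen;
  llvm::SmallVector<int, 8> Out;
  for (int V : In)
    if (Seen.insert(V).second) // .second is true only for newly inserted values
      Out.push_back(V);
  return Out;
}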
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
An instruction for storing to memory.
Definition: Instructions.h:317
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition: StringRef.h:462
StringRef slice(size_t Start, size_t End) const
Return a reference to the substring from [Start, End).
Definition: StringRef.h:676
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
StringRef save(const char *S)
Definition: StringSaver.h:30
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
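The StringSwitch entries above compose as in this minimal, illustrative sketch; the strings and return values are made up for the example.

// Illustrative sketch only: map a string onto a value with llvm::StringSwitch.
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

static int parseLevel(llvm::StringRef S) {
  return llvm::StringSwitch<int>(S)
      .Case("none", 0)
      .Case("fast", 1)
      .Case("full", 2)
      .Default(-1); // returned when no Case matches
}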
Class to represent struct types.
Definition: DerivedTypes.h:216
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:373
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
EVT getMemValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const
Returns the target-specific address of the unsafe stack pointer.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
virtual bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const
Check whether or not MI needs to be moved close to its uses.
void setMaximumJumpTableSize(unsigned)
Indicate the maximum number of entries in jump tables.
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setMaxBytesForAlignment(unsigned MaxBytes)
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual Value * getSDagStackGuard(const Module &M) const
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
unsigned getMaximumJumpTableSize() const
Return upper limit for number of entries in a jump table.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
virtual Value * getIRStackGuard(IRBuilderBase &IRB) const
If the target has a standard location for the stack protector guard, returns the address of that loca...
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is a legal add immediate, that is, the target has add instruction...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const
Return true if it is profitable to reduce a load to a smaller type.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
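isOperationLegalOrCustom and isTypeLegal above are public query hooks; the sketch below shows the typical shape of such a query. It is illustrative only: the opcode and value type are arbitrary and say nothing about AArch64's actual legality rules.

// Illustrative sketch only: check an operation/type combination before a
// transform relies on it.
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"

static bool canKeepAsRotate(const llvm::TargetLoweringBase &TLI) {
  return TLI.isTypeLegal(llvm::MVT::i64) &&
         TLI.isOperationLegalOrCustom(llvm::ISD::ROTL, llvm::MVT::i64);
}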
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal or custom on this target.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
virtual EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
MVT getFrameIndexTy(const DataLayout &DL) const
Return the type for frame index, which is determined by the alloca address space specified through th...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
virtual bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions Ref: "Hacker's Delight" by Henry Warren 1...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
virtual bool useLoadStackGuardNode() const
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
virtual bool isTargetCanonicalConstantNode(SDValue Op) const
Returns true if the given Opc is considered a canonical constant for the target, which should not be ...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned TLSSize
Bit size of immediate TLS offsets (0 == use the default).
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TargetCostKind CostKind) const
Return the expected cost of materializing for the given integer immediate of the specified type.
@ TCC_Free
Expected to fold away in lowering.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
bool isOSMSVCRT() const
Is this a "Windows" OS targeting a "MSVCRT.dll" environment.
Definition: Triple.h:662
bool isWindowsMSVCEnvironment() const
Checks if the environment could be MSVC.
Definition: Triple.h:629
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:342
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
static Type * getHalfTy(LLVMContext &C)
static Type * getDoubleTy(LLVMContext &C)
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition: Type.h:252
static Type * getBFloatTy(LLVMContext &C)
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
static IntegerType * getInt1Ty(LLVMContext &C)
@ FloatTyID
32-bit floating point type
Definition: Type.h:58
@ DoubleTyID
64-bit floating point type
Definition: Type.h:59
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:302
static IntegerType * getInt16Ty(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
static IntegerType * getInt8Ty(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isScalableTy() const
Return true if this is a type whose size is a known multiple of vscale.
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
static Type * getFloatTy(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:137
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getContainedType(unsigned i) const
This method is used to implement the type iterator (defined at the end of the file).
Definition: Type.h:377
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1808
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
const Use & getOperandUse(unsigned i) const
Definition: User.h:182
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
This class is used to represent EVT's, which are used to parameterize some operations.
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
Definition: DerivedTypes.h:507
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
Definition: DerivedTypes.h:454
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:676
static VectorType * getTruncatedElementVectorType(VectorType *VTy)
Definition: DerivedTypes.h:472
Type * getElementType() const
Definition: DerivedTypes.h:436
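The Type/VectorType factory methods above combine as in this short illustrative sketch; the element count is an arbitrary example.

// Illustrative sketch only: build a fixed-width <4 x i32> vector type.
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"

static llvm::VectorType *makeV4I32(llvm::LLVMContext &Ctx) {
  llvm::Type *I32 = llvm::Type::getInt32Ty(Ctx);
  // ElementCount::getFixed(4) describes <4 x i32>; getScalable(4) would
  // describe <vscale x 4 x i32> instead.
  return llvm::VectorType::get(I32, llvm::ElementCount::getFixed(4));
}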
This class represents zero extension of integer types.
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:199
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:251
self_iterator getIterator()
Definition: ilist_node.h:109
#define UINT64_MAX
Definition: DataTypes.h:77
#define INT64_MAX
Definition: DataTypes.h:71
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static CondCode getInvertedCondCode(CondCode Code)
static unsigned getNZCVToSatisfyCondCode(CondCode Code)
Given a condition code, return NZCV flags that would satisfy that condition.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
@ NVCAST
Natural vector cast.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
static uint8_t encodeAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType9(uint64_t Imm)
static bool isAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType5(uint64_t Imm)
static int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType10(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType9(uint64_t Imm)
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
static bool isAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType5(uint64_t Imm)
static int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
static bool isAdvSIMDModImmType10(uint64_t Imm)
static int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType8(uint64_t Imm)
static bool isAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType11(uint64_t Imm)
static bool isAdvSIMDModImmType11(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType6(uint64_t Imm)
static bool isAdvSIMDModImmType8(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType6(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType1(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType1(uint64_t Imm)
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
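The logical-immediate helpers above pair up as in the hedged sketch below; it is illustrative only, and the "MCTargetDesc/AArch64AddressingModes.h" include path is an assumption about the usual AArch64 target layout.

// Illustrative sketch only: validate and encode an AND/ORR/EOR bitmask immediate.
#include "MCTargetDesc/AArch64AddressingModes.h"
#include <cstdint>
#include <optional>

static std::optional<uint64_t> tryEncodeLogicalImm(uint64_t Imm, unsigned RegSize) {
  if (!llvm::AArch64_AM::isLogicalImmediate(Imm, RegSize))
    return std::nullopt; // not representable as a logical immediate
  return llvm::AArch64_AM::encodeLogicalImmediate(Imm, RegSize);
}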
ArrayRef< MCPhysReg > getFPRArgRegs()
int getSMEPseudoMap(uint16_t Opcode)
static constexpr unsigned SVEMaxBitsPerVector
const unsigned RoundingBitsPos
const uint64_t ReservedFPControlBits
static constexpr unsigned SVEBitsPerBlock
ArrayRef< MCPhysReg > getGPRArgRegs()
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
@ ARM64EC_Thunk_Native
Calling convention used in the ARM64EC ABI to implement calls between ARM64 code and thunks.
Definition: CallingConv.h:265
@ AArch64_VectorCall
Used between AArch64 Advanced SIMD functions.
Definition: CallingConv.h:221
@ Swift
Calling convention for Swift.
Definition: CallingConv.h:69
@ AArch64_SVE_VectorCall
Used between AArch64 SVE functions.
Definition: CallingConv.h:224
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition: CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserves most registers.
Definition: CallingConv.h:63
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2
Preserve X2-X15, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:241
@ CXX_FAST_TLS
Used for access functions.
Definition: CallingConv.h:72
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0
Preserve X0-X13, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:238
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition: CallingConv.h:50
@ PreserveAll
Used for runtime calls that preserves (almost) all registers.
Definition: CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ Win64
The C convention as implemented on Windows/x86-64 and AArch64.
Definition: CallingConv.h:159
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition: CallingConv.h:87
@ GRAAL
Used by GraalVM. Two additional registers are reserved.
Definition: CallingConv.h:255
@ ARM64EC_Thunk_X64
Calling convention used in the ARM64EC ABI to implement calls between x64 code and thunks.
Definition: CallingConv.h:260
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:751
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:237
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1133
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1129
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition: ISDOpcodes.h:477
@ VECREDUCE_SEQ_FADD
Generic reduction nodes.
Definition: ISDOpcodes.h:1346
@ VECREDUCE_SMIN
Definition: ISDOpcodes.h:1377
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:251
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:560
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:715
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1162
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1248
@ STRICT_FCEIL
Definition: ISDOpcodes.h:427
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:240
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1038
@ SET_FPMODE
Sets the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1028
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:784
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:484
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:199
@ RETURNADDR
Definition: ISDOpcodes.h:95
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ STRICT_FMINIMUM
Definition: ISDOpcodes.h:437
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:791
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:544
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
Definition: ISDOpcodes.h:1362
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:391
@ VECREDUCE_FMAXIMUM
FMINIMUM/FMAXIMUM nodes propagate NaNs and signed zeroes using the llvm.minimum and llvm....
Definition: ISDOpcodes.h:1366
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:689
@ RESET_FPMODE
Sets default dynamic floating-point control modes.
Definition: ISDOpcodes.h:1032
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:256
@ VECREDUCE_SMAX
Definition: ISDOpcodes.h:1376
@ STRICT_FSETCCS
Definition: ISDOpcodes.h:478
@ STRICT_FLOG2
Definition: ISDOpcodes.h:422
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1274
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:904
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:230
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1275
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:940
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition: ISDOpcodes.h:412
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1407
@ GlobalTLSAddress
Definition: ISDOpcodes.h:79
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:886
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:775
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition: ISDOpcodes.h:663
@ STRICT_UINT_TO_FP
Definition: ISDOpcodes.h:451
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:621
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition: ISDOpcodes.h:101
@ VECREDUCE_FADD
These reductions have relaxed evaluation order semantics, and have a single vector operand.
Definition: ISDOpcodes.h:1359
@ WRITE_REGISTER
Definition: ISDOpcodes.h:119
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1228
@ VECREDUCE_FMIN
Definition: ISDOpcodes.h:1363
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:995
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition: ISDOpcodes.h:759
@ STRICT_LROUND
Definition: ISDOpcodes.h:432
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:931
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1084
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:328
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1059
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1063
@ VECTOR_INTERLEAVE
VECTOR_INTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and output vectors having the same...
Definition: ISDOpcodes.h:587
@ STEP_VECTOR
STEP_VECTOR(IMM) - Returns a scalable vector whose lanes are comprised of a linear sequence of unsign...
Definition: ISDOpcodes.h:647
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:350
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:728
@ STRICT_FPOWI
Definition: ISDOpcodes.h:414
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1244
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:212
@ VECREDUCE_UMAX
Definition: ISDOpcodes.h:1378
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition: ISDOpcodes.h:628
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1158
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:324
@ STRICT_FTRUNC
Definition: ISDOpcodes.h:431
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
Definition: ISDOpcodes.h:1371
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
Definition: ISDOpcodes.h:881
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:652
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1023
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:706
@ ATOMIC_LOAD_CLR
Definition: ISDOpcodes.h:1273
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:601
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1272
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:574
@ STRICT_FMAXIMUM
Definition: ISDOpcodes.h:436
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition: ISDOpcodes.h:47
@ STRICT_FMAXNUM
Definition: ISDOpcodes.h:425
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition: ISDOpcodes.h:118
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:536
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:203
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:781
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1218
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:857
@ STRICT_FMINNUM
Definition: ISDOpcodes.h:426
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:743
@ VSCALE
VSCALE(IMM) - Returns the runtime scaling factor used to calculate the number of elements within a sc...
Definition: ISDOpcodes.h:1336
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1255
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:972
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
Definition: ISDOpcodes.h:1222
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:332
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1048
@ STRICT_LRINT
Definition: ISDOpcodes.h:434
@ ConstantPool
Definition: ISDOpcodes.h:82
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:799
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:675
@ VECTOR_REVERSE
VECTOR_REVERSE(VECTOR) - Returns a vector, of the same type as VECTOR, whose elements are shuffled us...
Definition: ISDOpcodes.h:592
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:889
@ STRICT_FROUND
Definition: ISDOpcodes.h:429
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:737
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:304
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition: ISDOpcodes.h:450
@ VECREDUCE_UMIN
Definition: ISDOpcodes.h:1379
@ STRICT_FFLOOR
Definition: ISDOpcodes.h:428
@ STRICT_FROUNDEVEN
Definition: ISDOpcodes.h:430
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:94
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1270
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:444
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:466
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:443
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:991
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1271
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:837
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1189
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition: ISDOpcodes.h:158
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:471
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:681
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1215
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:184
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition: ISDOpcodes.h:658
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition: ISDOpcodes.h:401
@ STRICT_FLOG10
Definition: ISDOpcodes.h:421
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:525
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ STRICT_LLRINT
Definition: ISDOpcodes.h:435
@ VECTOR_SPLICE
VECTOR_SPLICE(VEC1, VEC2, IMM) - Returns a subvector of the same type as VEC1/VEC2 from CONCAT_VECTOR...
Definition: ISDOpcodes.h:613
@ STRICT_FEXP2
Definition: ISDOpcodes.h:419
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1269
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:870
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition: ISDOpcodes.h:106
@ STRICT_LLROUND
Definition: ISDOpcodes.h:433
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition: ISDOpcodes.h:832
@ STRICT_FNEARBYINT
Definition: ISDOpcodes.h:424
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:856
@ VECREDUCE_FMINIMUM
Definition: ISDOpcodes.h:1367
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:787
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1153
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1077
@ BlockAddress
Definition: ISDOpcodes.h:84
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:764
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:494
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:341
@ AssertZext
Definition: ISDOpcodes.h:62
@ STRICT_FRINT
Definition: ISDOpcodes.h:423
@ VECTOR_DEINTERLEAVE
VECTOR_DEINTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and output vectors having the sa...
Definition: ISDOpcodes.h:581
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:314
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:516
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isExtOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1601
bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
bool isVectorShrinkable(const SDNode *N, unsigned NewEltSize, bool Signed)
Returns true if the specified node is a vector where all elements can be truncated to the specified e...
CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
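getSetCCInverse and getSetCCSwappedOperands above are the usual way to derive related comparisons; a minimal illustrative sketch follows, assuming the declarations pulled in by SelectionDAGNodes.h and using an arbitrary i32 operand type.

// Illustrative sketch only: derive the inverse and operand-swapped forms of a
// comparison (X CC Y).
#include "llvm/CodeGen/SelectionDAGNodes.h"

static void relatedCondCodes(llvm::ISD::CondCode CC) {
  llvm::ISD::CondCode Inv = llvm::ISD::getSetCCInverse(CC, llvm::MVT::i32); // !(X CC Y)
  llvm::ISD::CondCode Swapped = llvm::ISD::getSetCCSwappedOperands(CC);     // (Y CC' X)
  (void)Inv;
  (void)Swapped;
}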
MemIndexType
MemIndexType enum - This enum defines how to interpret MGATHER/SCATTER's index parameter when calcula...
Definition: ISDOpcodes.h:1492
bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:1479
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1530
bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1510
static const int LAST_INDEXED_MODE
Definition: ISDOpcodes.h:1481
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1471
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:810
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:869
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
VScaleVal_match m_VScale()
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
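The PatternMatch helpers above compose into declarative IR matchers; the sketch below is an arbitrary example (a one-use zext feeding a shl), not a combine taken from this file.

// Illustrative sketch only: match "shl (zext X), Y" where the zext has one use,
// capturing X in Src.
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

static bool isShlOfZExt(llvm::Value *V, llvm::Value *&Src) {
  using namespace llvm::PatternMatch;
  return match(V, m_Shl(m_OneUse(m_ZExt(m_Value(Src))), m_Value()));
}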
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
@ Define
Register definition.
@ GeneralDynamic
Definition: CodeGen.h:46
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
std::optional< Function * > getAttachedARCFunction(const CallBase *CB)
This function returns operand bundle clang_arc_attachedcall's argument, which is the address of the A...
Definition: ObjCARCUtil.h:43
bool hasAttachedCallOpBundle(const CallBase *CB)
Definition: ObjCARCUtil.h:29
DiagnosticInfoOptimizationBase::Argument NV
@ FalseVal
Definition: TGLexer.h:59
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
bool isPackedVectorType(EVT SomeVT)
Definition: VECustomDAG.cpp:22
bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isZIPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
@ Offset
Definition: DWP.cpp:456
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:853
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void GetReturnInfo(CallingConv::ID CC, Type *ReturnType, AttributeList attr, SmallVectorImpl< ISD::OutputArg > &Outs, const TargetLowering &TLI, const DataLayout &DL)
Given an LLVM IR type and return type attributes, compute the return value EVTs and flags,...
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool isUZPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition: Utils.cpp:295
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:239
bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
bool CC_AArch64_Win64PCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition: MathExtras.h:343
bool RetCC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
bool isIntOrFPConstant(SDValue V)
Return true if V is either a integer or FP constant.
bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
std::optional< unsigned > getSVEPredPatternFromNumElements(unsigned MinNumElts)
Return specific VL predicate pattern based on the number of elements.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64-bit edition).
Definition: MathExtras.h:280
bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition: Utils.cpp:1509
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:330
bool CC_AArch64_Arm64EC_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:269
unsigned M1(unsigned Val)
Definition: VE.h:376
bool isReleaseOrStronger(AtomicOrdering AO)
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
bool CC_AArch64_Arm64EC_Thunk_Native(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool CC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition: MathExtras.h:257
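Several of the bit-manipulation helpers above are typically used together when classifying immediates; the sketch below is illustrative only and the input value is arbitrary.

// Illustrative sketch only: classify a 64-bit value with the helpers above.
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>

static void classifyImm(uint64_t V) {
  bool Pow2 = llvm::isPowerOf2_64(V);           // exactly one bit set
  bool Mask = llvm::isMask_64(V);               // contiguous ones starting at bit 0
  bool ShiftedMask = llvm::isShiftedMask_64(V); // contiguous ones anywhere
  int TrailingZeros = llvm::countr_zero(V);     // zeros below the lowest set bit
  int Bits = llvm::bit_width(V);                // bits needed to represent V
  (void)Pow2; (void)Mask; (void)ShiftedMask; (void)TrailingZeros; (void)Bits;
}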
bool RetCC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
static unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
constexpr int PoisonMaskElem
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Other
Any other memory.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
CombineLevel
Definition: DAGCombine.h:15
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
bool isIntN(unsigned N, int64_t x)
Checks if an signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:244
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition: VE.h:375
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79
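ComputeValueVTs is typically called with only its first four arguments, letting the optional MemVTs/Offsets outputs default. A hedged, compile-only sketch of a hypothetical helper (countValueParts is not part of LLVM):

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"

// Hypothetical helper: count the scalar EVTs an IR type decomposes into.
static unsigned countValueParts(const llvm::TargetLowering &TLI,
                                const llvm::DataLayout &DL, llvm::Type *Ty) {
  llvm::SmallVector<llvm::EVT, 4> ValueVTs;
  llvm::ComputeValueVTs(TLI, DL, Ty, ValueVTs); // optional outputs left defaulted
  return ValueVTs.size();
}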
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
bool isAcquireOrStronger(AtomicOrdering AO)
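The AtomicOrdering predicates isAcquireOrStronger and isReleaseOrStronger (both listed on this page) classify orderings relative to acquire/release semantics. A minimal standalone sketch of the predicates only, nothing from this file:

#include "llvm/Support/AtomicOrdering.h"
#include <cassert>

int main() {
  using llvm::AtomicOrdering;
  // Acquire, AcquireRelease and SequentiallyConsistent are acquire-or-stronger.
  assert(llvm::isAcquireOrStronger(AtomicOrdering::Acquire));
  assert(llvm::isAcquireOrStronger(AtomicOrdering::SequentiallyConsistent));
  assert(!llvm::isAcquireOrStronger(AtomicOrdering::Monotonic));
  // Release, AcquireRelease and SequentiallyConsistent are release-or-stronger.
  assert(llvm::isReleaseOrStronger(AtomicOrdering::Release));
  assert(!llvm::isReleaseOrStronger(AtomicOrdering::Acquire));
  return 0;
}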
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
gep_type_iterator gep_type_begin(const User *GEP)
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if, which is equivalent to: C.erase(remove_if(C.begin(), C.end(), pred), C.end());
Definition: STLExtras.h:2051
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
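The Align helpers above are how byte sizes get rounded up and how the alignment of an offset pointer is derived. A minimal standalone sketch:

#include "llvm/Support/Alignment.h"
#include <cassert>

int main() {
  llvm::Align A(16);
  // alignTo rounds a byte size up to the next multiple of the alignment.
  assert(llvm::alignTo(20, A) == 32);
  // An address that is 16-byte aligned is only 8-byte aligned 8 bytes later.
  assert(llvm::commonAlignment(A, 8).value() == 8);
  // value() exposes the raw alignment in bytes.
  assert(A.value() == 16);
  return 0;
}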
bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition: STLExtras.h:2039
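The STLExtras range helpers referenced on this page (any_of, find_if, erase_if, is_contained, all_equal) are thin wrappers over the standard algorithms that take a range instead of an iterator pair. A minimal standalone sketch:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>

int main() {
  llvm::SmallVector<int, 8> V = {1, 2, 3, 4};
  assert(llvm::any_of(V, [](int X) { return X > 3; }));
  assert(llvm::is_contained(V, 2));
  // find_if returns an iterator to the first match (here the first even value).
  auto It = llvm::find_if(V, [](int X) { return X % 2 == 0; });
  assert(It != V.end() && *It == 2);
  // erase_if removes every element matching the predicate.
  llvm::erase_if(V, [](int X) { return X % 2 == 0; });
  assert(V.size() == 2);
  // all_equal is true when every value in the initializer list is the same.
  assert(llvm::all_equal({7, 7, 7}));
  return 0;
}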
static const MachineMemOperand::Flags MOStridedAccess
bool CC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
@ Default
The result values are uniform if and only if all operands are uniform.
llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
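createSequentialMask builds the index vector used for shuffle masks; trailing undef elements are encoded as -1 (see PoisonMaskElem above). A minimal standalone sketch:

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
#include <cassert>

int main() {
  // Start at index 2, four sequential indices, two trailing undefs:
  // {2, 3, 4, 5, -1, -1}.
  llvm::SmallVector<int, 16> Mask = llvm::createSequentialMask(2, 4, 2);
  assert(Mask.size() == 6);
  assert(Mask[0] == 2 && Mask[3] == 5 && Mask[5] == -1);
  return 0;
}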
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
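peekThroughOneUseBitcasts, isOneConstant, isAllOnesConstant and isNullFPConstant (all listed on this page) are common building blocks for DAG-combine predicates. A hedged, compile-only sketch of a hypothetical predicate (isOneOrAllOnes is not part of LLVM):

#include "llvm/CodeGen/SelectionDAG.h"

// Hypothetical predicate: is V, looking through a single-use bitcast,
// either the integer constant 1 or the all-ones integer constant?
static bool isOneOrAllOnes(llvm::SDValue V) {
  V = llvm::peekThroughOneUseBitcasts(V);
  return llvm::isOneConstant(V) || llvm::isAllOnesConstant(V);
}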
static const unsigned PerfectShuffleTable[6561+1]
@ Enable
Enable colors.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
Helper structure to keep track of a SET_CC lowered into AArch64 code.
AArch64CC::CondCode CC
Helper structure to keep track of ISD::SET_CC operands.
This is used by foldLoadsRecursive() to capture a Root Load node which is of type or(load,...
Helper structure to be able to read SetCC information.
static unsigned int semanticsPrecision(const fltSemantics &)
Definition: APFloat.cpp:292
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Represent subnormal handling kind for floating point instruction inputs and outputs.
Extended Value Type.
Definition: ValueTypes.h:34
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted to an integer type with the same bitwidth.
Definition: ValueTypes.h:93
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
uint64_t getScalarStoreSize() const
Definition: ValueTypes.h:387
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:274
bool bitsLT(EVT VT) const
Return true if this has fewer bits than VT.
Definition: ValueTypes.h:290
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:340
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:448
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that is chosen by the caller.
Definition: ValueTypes.h:112
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition: ValueTypes.h:349
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition: ValueTypes.h:415
bool isPow2VectorType() const
Returns true if the given vector type has a power-of-2 number of elements.
Definition: ValueTypes.h:455
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:203
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:366
EVT widenIntegerVectorElementType(LLVMContext &Context) const
Return a VT for an integer vector type with the size of the elements doubled.
Definition: ValueTypes.h:429
bool isScalableVT() const
Return true if the type is a scalable type.
Definition: ValueTypes.h:183
bool isFixedLengthVector() const
Definition: ValueTypes.h:177
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition: ValueTypes.h:58
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool bitsGE(EVT VT) const
Return true if this has at least as many bits as VT.
Definition: ValueTypes.h:282
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition: ValueTypes.h:208
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:246
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:202
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition: ValueTypes.h:173
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type that is chosen by the caller.
Definition: ValueTypes.h:101
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:438
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:151
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition: ValueTypes.h:198
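Most of the EVT queries listed above are thin wrappers around MVT. A minimal standalone sketch showing a few of them on v4f32:

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

int main() {
  llvm::LLVMContext Ctx;
  // v4f32: a fixed-length, 128-bit floating-point vector.
  llvm::EVT V4F32 = llvm::EVT::getVectorVT(Ctx, llvm::MVT::f32, 4);
  assert(V4F32.isFixedLengthVector() && V4F32.is128BitVector());
  assert(V4F32.isFloatingPoint() && V4F32.getVectorNumElements() == 4);
  // Same shape, integer elements of the same width: v4i32.
  llvm::EVT V4I32 = V4F32.changeVectorElementTypeToInteger();
  assert(V4I32.isInteger() && V4I32.getScalarSizeInBits() == 32);
  // Half the element count: v2i32, a 64-bit vector.
  assert(V4I32.getHalfNumVectorElementsVT(Ctx).is64BitVector());
  return 0;
}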
Describes a register that needs to be forwarded from the prologue to a musttail call.
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing formal return value.
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:297
static KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
Definition: KnownBits.cpp:434
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:157
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:40
static KnownBits lshr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for lshr(LHS, RHS).
Definition: KnownBits.cpp:376
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known bits.
Definition: KnownBits.h:292
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition: KnownBits.h:307
static KnownBits shl(const KnownBits &LHS, const KnownBits &RHS, bool NUW=false, bool NSW=false, bool ShAmtNonZero=false)
Compute known bits for shl(LHS, RHS).
Definition: KnownBits.cpp:291
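The KnownBits helpers above track which bits of a value are provably zero or one. A minimal standalone sketch:

#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
#include <cassert>

int main() {
  // A fully known 8-bit constant 12 (0b00001100).
  llvm::KnownBits K = llvm::KnownBits::makeConstant(llvm::APInt(8, 12));
  assert(K.getBitWidth() == 8);
  // Four leading bits are known zero, so at most four bits are active.
  assert(K.countMaxActiveBits() == 4);
  // Truncating to 4 bits keeps the low bits 0b1100 fully known.
  llvm::KnownBits Low = K.trunc(4);
  assert(Low.isConstant() && Low.getConstant() == 12);
  return 0;
}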
Structure used to represent pair of argument number after call lowering and register used to transfer that argument.
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.
unsigned getAddrSpace() const
Return the LLVM IR address space number that this pointer points into.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
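The MachinePointerInfo factory functions above attach IR-level pointer information to memory operands. A hedged, compile-only sketch of a hypothetical helper (fixedStackSlotWithOffset is not part of LLVM):

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"

// Hypothetical helper: pointer info for fixed stack slot FI, rebased by Offset bytes.
static llvm::MachinePointerInfo
fixedStackSlotWithOffset(llvm::MachineFunction &MF, int FI, int64_t Offset) {
  return llvm::MachinePointerInfo::getFixedStack(MF, FI).getWithOffset(Offset);
}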
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Constraint for a predicate of the form "cmp Pred Op, OtherOp", where Op is the value the constraint applies to.
Definition: PredicateInfo.h:74
These are IR-level optimization flags that may be propagated to SDNodes.
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::OutputArg, 32 > Outs
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetLowering to its clients that want to combine.
bool CombineTo(SDValue O, SDValue N)
Helper structure to keep track of SetCC information.
GenericSetCCInfo Generic
AArch64SetCCInfo AArch64