1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
22#include "llvm/ADT/APFloat.h"
23#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/ArrayRef.h"
25#include "llvm/ADT/STLExtras.h"
26#include "llvm/ADT/SmallSet.h"
28#include "llvm/ADT/Statistic.h"
29#include "llvm/ADT/StringRef.h"
30#include "llvm/ADT/Twine.h"
59#include "llvm/IR/Attributes.h"
60#include "llvm/IR/Constants.h"
61#include "llvm/IR/DataLayout.h"
62#include "llvm/IR/DebugLoc.h"
64#include "llvm/IR/Function.h"
66#include "llvm/IR/GlobalValue.h"
67#include "llvm/IR/IRBuilder.h"
68#include "llvm/IR/Instruction.h"
71#include "llvm/IR/Intrinsics.h"
72#include "llvm/IR/IntrinsicsAArch64.h"
73#include "llvm/IR/Module.h"
75#include "llvm/IR/Type.h"
76#include "llvm/IR/Use.h"
77#include "llvm/IR/Value.h"
83#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <bitset>
94#include <cassert>
95#include <cctype>
96#include <cstdint>
97#include <cstdlib>
98#include <iterator>
99#include <limits>
100#include <optional>
101#include <tuple>
102#include <utility>
103#include <vector>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107
108#define DEBUG_TYPE "aarch64-lower"
109
110STATISTIC(NumTailCalls, "Number of tail calls");
111STATISTIC(NumShiftInserts, "Number of vector shift inserts");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130// in the future, once both implementations are based on MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
142// All of the XOR, OR and CMP use ALU ports, and the data dependency becomes
143// the bottleneck after this transform on high-end CPUs. This maximum
144// leaf-node limit therefore guards that the cmp+ccmp transform stays profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148/// Value type used for condition codes.
149static const MVT MVT_CC = MVT::i32;
150
151static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
152 AArch64::X3, AArch64::X4, AArch64::X5,
153 AArch64::X6, AArch64::X7};
154static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
155 AArch64::Q3, AArch64::Q4, AArch64::Q5,
156 AArch64::Q6, AArch64::Q7};
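// For illustration (assuming the standard AAPCS64 calling convention that
// these arrays encode): a call such as void f(int a, long b, double d, float s)
// passes a in W0, b in X1, d in D0 and s in S1, i.e. successive entries of
// GPRArgRegs and the low sub-registers of successive entries of FPRArgRegs.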
157
159
161
162static inline EVT getPackedSVEVectorVT(EVT VT) {
163 switch (VT.getSimpleVT().SimpleTy) {
164 default:
165 llvm_unreachable("unexpected element type for vector");
166 case MVT::i8:
167 return MVT::nxv16i8;
168 case MVT::i16:
169 return MVT::nxv8i16;
170 case MVT::i32:
171 return MVT::nxv4i32;
172 case MVT::i64:
173 return MVT::nxv2i64;
174 case MVT::f16:
175 return MVT::nxv8f16;
176 case MVT::f32:
177 return MVT::nxv4f32;
178 case MVT::f64:
179 return MVT::nxv2f64;
180 case MVT::bf16:
181 return MVT::nxv8bf16;
182 }
183}
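// Illustrative sketch of the mapping above: the element type is chosen so the
// element count fills the 128-bit SVE granule, e.g.
//   getPackedSVEVectorVT(MVT::f32); // -> MVT::nxv4f32 (4 x 32 bits)
//   getPackedSVEVectorVT(MVT::i8);  // -> MVT::nxv16i8 (16 x 8 bits)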
184
185// NOTE: Currently there's only a need to return integer vector types. If this
186// changes then just add an extra "type" parameter.
187static inline MVT getPackedSVEVectorVT(ElementCount EC) {
188 switch (EC.getKnownMinValue()) {
189 default:
190 llvm_unreachable("unexpected element count for vector");
191 case 16:
192 return MVT::nxv16i8;
193 case 8:
194 return MVT::nxv8i16;
195 case 4:
196 return MVT::nxv4i32;
197 case 2:
198 return MVT::nxv2i64;
199 }
200}
201
202static inline MVT getPromotedVTForPredicate(EVT VT) {
203 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
204 "Expected scalable predicate vector type!");
205 switch (VT.getVectorMinNumElements()) {
206 default:
207 llvm_unreachable("unexpected element count for vector");
208 case 2:
209 return MVT::nxv2i64;
210 case 4:
211 return MVT::nxv4i32;
212 case 8:
213 return MVT::nxv8i16;
214 case 16:
215 return MVT::nxv16i8;
216 }
217}
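// Illustrative examples of the promotion above: nxv4i1 is promoted to nxv4i32
// and nxv16i1 to nxv16i8 -- the element count is preserved while the element
// width grows so that the promoted vector fills the 128-bit granule.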
218
219/// Returns true if VT's elements occupy the lowest bit positions of its
220/// associated register class without any intervening space.
221///
222/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
223/// same register class, but only nxv8f16 can be treated as a packed vector.
224static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
226 "Expected legal vector type!");
227 return VT.isFixedLengthVector() ||
229}
230
231// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
232// predicate and end with a passthru value matching the result type.
233static bool isMergePassthruOpcode(unsigned Opc) {
234 switch (Opc) {
235 default:
236 return false;
266 return true;
267 }
268}
269
270// Returns true if inactive lanes are known to be zeroed by construction.
271static bool isZeroingInactiveLanes(SDValue Op) {
272 switch (Op.getOpcode()) {
273 default:
274 return false;
275 // We guarantee i1 splat_vectors to zero the other lanes
279 return true;
281 switch (Op.getConstantOperandVal(0)) {
282 default:
283 return false;
284 case Intrinsic::aarch64_sve_ptrue:
285 case Intrinsic::aarch64_sve_pnext:
286 case Intrinsic::aarch64_sve_cmpeq:
287 case Intrinsic::aarch64_sve_cmpne:
288 case Intrinsic::aarch64_sve_cmpge:
289 case Intrinsic::aarch64_sve_cmpgt:
290 case Intrinsic::aarch64_sve_cmphs:
291 case Intrinsic::aarch64_sve_cmphi:
292 case Intrinsic::aarch64_sve_cmpeq_wide:
293 case Intrinsic::aarch64_sve_cmpne_wide:
294 case Intrinsic::aarch64_sve_cmpge_wide:
295 case Intrinsic::aarch64_sve_cmpgt_wide:
296 case Intrinsic::aarch64_sve_cmplt_wide:
297 case Intrinsic::aarch64_sve_cmple_wide:
298 case Intrinsic::aarch64_sve_cmphs_wide:
299 case Intrinsic::aarch64_sve_cmphi_wide:
300 case Intrinsic::aarch64_sve_cmplo_wide:
301 case Intrinsic::aarch64_sve_cmpls_wide:
302 case Intrinsic::aarch64_sve_fcmpeq:
303 case Intrinsic::aarch64_sve_fcmpne:
304 case Intrinsic::aarch64_sve_fcmpge:
305 case Intrinsic::aarch64_sve_fcmpgt:
306 case Intrinsic::aarch64_sve_fcmpuo:
307 case Intrinsic::aarch64_sve_facgt:
308 case Intrinsic::aarch64_sve_facge:
309 case Intrinsic::aarch64_sve_whilege:
310 case Intrinsic::aarch64_sve_whilegt:
311 case Intrinsic::aarch64_sve_whilehi:
312 case Intrinsic::aarch64_sve_whilehs:
313 case Intrinsic::aarch64_sve_whilele:
314 case Intrinsic::aarch64_sve_whilelo:
315 case Intrinsic::aarch64_sve_whilels:
316 case Intrinsic::aarch64_sve_whilelt:
317 case Intrinsic::aarch64_sve_match:
318 case Intrinsic::aarch64_sve_nmatch:
319 case Intrinsic::aarch64_sve_whilege_x2:
320 case Intrinsic::aarch64_sve_whilegt_x2:
321 case Intrinsic::aarch64_sve_whilehi_x2:
322 case Intrinsic::aarch64_sve_whilehs_x2:
323 case Intrinsic::aarch64_sve_whilele_x2:
324 case Intrinsic::aarch64_sve_whilelo_x2:
325 case Intrinsic::aarch64_sve_whilels_x2:
326 case Intrinsic::aarch64_sve_whilelt_x2:
327 return true;
328 }
329 }
330}
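// For example (illustrative): an SVE predicate-generating compare such as
// aarch64_sve_cmpgt writes its destination predicate under a governing
// predicate and architecturally zeroes the inactive lanes, which is why
// callers of this helper can skip an explicit AND with the predicate.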
331
332AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
333 const AArch64Subtarget &STI)
334 : TargetLowering(TM), Subtarget(&STI) {
335 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
336 // we have to make something up. Arbitrarily, choose ZeroOrOne.
338 // When comparing vectors the result sets the different elements in the
339 // vector to all-one or all-zero.
341
342 // Set up the register classes.
343 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
344 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
345
346 if (Subtarget->hasLS64()) {
347 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
348 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
350 }
351
352 if (Subtarget->hasFPARMv8()) {
353 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
354 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
355 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
356 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
357 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
358 }
359
360 if (Subtarget->hasNEON()) {
361 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
362 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
363 // Someone set us up the NEON.
364 addDRTypeForNEON(MVT::v2f32);
365 addDRTypeForNEON(MVT::v8i8);
366 addDRTypeForNEON(MVT::v4i16);
367 addDRTypeForNEON(MVT::v2i32);
368 addDRTypeForNEON(MVT::v1i64);
369 addDRTypeForNEON(MVT::v1f64);
370 addDRTypeForNEON(MVT::v4f16);
371 addDRTypeForNEON(MVT::v4bf16);
372
373 addQRTypeForNEON(MVT::v4f32);
374 addQRTypeForNEON(MVT::v2f64);
375 addQRTypeForNEON(MVT::v16i8);
376 addQRTypeForNEON(MVT::v8i16);
377 addQRTypeForNEON(MVT::v4i32);
378 addQRTypeForNEON(MVT::v2i64);
379 addQRTypeForNEON(MVT::v8f16);
380 addQRTypeForNEON(MVT::v8bf16);
381 }
382
383 if (Subtarget->hasSVEorSME()) {
384 // Add legal sve predicate types
385 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
386 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
387 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
388 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
389 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
390
391 // Add legal sve data types
392 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
393 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
394 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
395 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
396
397 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
398 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
399 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
400 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
401 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
402 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
403
404 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
405 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
406 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
407
408 if (Subtarget->useSVEForFixedLengthVectors()) {
411 addRegisterClass(VT, &AArch64::ZPRRegClass);
412
415 addRegisterClass(VT, &AArch64::ZPRRegClass);
416 }
417 }
418
419 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
420 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
421 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
422 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
423
424 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
425 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
426 }
427
428 // Compute derived properties from the register classes
430
431 // Provide all sorts of operation actions
470
474
478
480
481 // Custom lowering hooks are needed for XOR
482 // to fold it into CSINC/CSINV.
485
486 // Virtually no operation on f128 is legal, but LLVM can't expand them when
487 // there's a valid register class, so we need custom operations in most cases.
511 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
512 // aren't handled.
513
514 // Lowering for many of the conversions is actually specified by the non-f128
515 // type. The LowerXXX function will be trivial when f128 isn't involved.
540 if (Subtarget->hasFPARMv8()) {
543 }
546 if (Subtarget->hasFPARMv8()) {
549 }
552
557
558 // Variable arguments.
563
564 // Variable-sized objects.
567
568 // Lowering Funnel Shifts to EXTR
573
575
576 // Constant pool entries
578
579 // BlockAddress
581
582 // AArch64 lacks both left-rotate and popcount instructions.
588 }
589
590 // AArch64 doesn't have i32 MULH{S|U}.
593
594 // AArch64 doesn't have {U|S}MUL_LOHI.
599
600 if (Subtarget->hasCSSC()) {
604
606
610
613
618
623 } else {
627
630
633 }
634
640 }
647
648 // Custom lower Add/Sub/Mul with overflow.
661
670
679 if (Subtarget->hasFullFP16()) {
682 } else {
685 }
686
687 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
695 setOperationAction(Op, MVT::f16, Promote);
696 setOperationAction(Op, MVT::v4f16, Expand);
697 setOperationAction(Op, MVT::v8f16, Expand);
698 setOperationAction(Op, MVT::bf16, Promote);
699 setOperationAction(Op, MVT::v4bf16, Expand);
700 setOperationAction(Op, MVT::v8bf16, Expand);
701 }
702
703 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
704 for (auto Op : {
708 ISD::FADD,
709 ISD::FSUB,
710 ISD::FMUL,
711 ISD::FDIV,
712 ISD::FMA,
742 })
743 setOperationAction(Op, ScalarVT, Promote);
744
745 for (auto Op : {ISD::FNEG, ISD::FABS})
746 setOperationAction(Op, ScalarVT, Legal);
747
748 // Round-to-integer operations need custom lowering for fp16, as Promote
749 // doesn't work because the result type is integer.
753 setOperationAction(Op, ScalarVT, Custom);
754
755 // promote v4f16 to v4f32 when that is known to be safe.
756 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
757 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
758 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
759 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
760 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
761 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
762 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
763 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
764 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
765 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
766 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
767 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
768
778
779 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
801 };
802
803 if (!Subtarget->hasFullFP16()) {
804 LegalizeNarrowFP(MVT::f16);
805 }
806 LegalizeNarrowFP(MVT::bf16);
809
810 // AArch64 has implementations of a lot of rounding-like FP operations.
811 for (auto Op :
822 for (MVT Ty : {MVT::f32, MVT::f64})
824 if (Subtarget->hasFullFP16())
825 setOperationAction(Op, MVT::f16, Legal);
826 }
827
828 // Basic strict FP operations are legal
831 for (MVT Ty : {MVT::f32, MVT::f64})
833 if (Subtarget->hasFullFP16())
834 setOperationAction(Op, MVT::f16, Legal);
835 }
836
837 // Strict conversion to a larger type is legal
838 for (auto VT : {MVT::f32, MVT::f64})
840
842
845
847 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
850 } else {
853 }
856
857 // Generate outline atomics library calls only if LSE was not specified for
858 // subtarget
859 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
885#define LCALLNAMES(A, B, N) \
886 setLibcallName(A##N##_RELAX, #B #N "_relax"); \
887 setLibcallName(A##N##_ACQ, #B #N "_acq"); \
888 setLibcallName(A##N##_REL, #B #N "_rel"); \
889 setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
890#define LCALLNAME4(A, B) \
891 LCALLNAMES(A, B, 1) \
892 LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
893#define LCALLNAME5(A, B) \
894 LCALLNAMES(A, B, 1) \
895 LCALLNAMES(A, B, 2) \
896 LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
897 LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
898 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
899 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
900 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
901 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
902 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
903#undef LCALLNAMES
904#undef LCALLNAME4
905#undef LCALLNAME5
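// As a worked illustration of the macros above, a single expansion such as
// LCALLNAMES(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp, 4) produces, after
// token pasting and string concatenation:
//   setLibcallName(RTLIB::OUTLINE_ATOMIC_SWP4_RELAX, "__aarch64_swp4_relax");
//   setLibcallName(RTLIB::OUTLINE_ATOMIC_SWP4_ACQ, "__aarch64_swp4_acq");
//   setLibcallName(RTLIB::OUTLINE_ATOMIC_SWP4_REL, "__aarch64_swp4_rel");
//   setLibcallName(RTLIB::OUTLINE_ATOMIC_SWP4_ACQ_REL, "__aarch64_swp4_acq_rel");
// so LCALLNAME4/LCALLNAME5 register one outline-atomics helper per access
// width and memory ordering.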
906 }
907
908 if (Subtarget->hasLSE128()) {
909 // Custom lowering because i128 is not legal. Must be replaced by 2x64
910 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
914 }
915
916 // 128-bit loads and stores can be done without expanding
919
920 // Aligned 128-bit loads and stores are single-copy atomic according to the
921 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
922 if (Subtarget->hasLSE2()) {
925 }
926
927 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
928 // custom lowering, as there are no un-paired non-temporal stores and
929 // legalization will break up 256 bit inputs.
931 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
932 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
933 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
938
939 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
940 // custom lowering, as there are no un-paired non-temporal loads, and
941 // legalization will break up 256 bit inputs.
942 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
943 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
944 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
945 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
946 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
947 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
948 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
949 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
950
951 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
953
954 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
955 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
956 // Issue __sincos_stret if available.
959 } else {
962 }
963
964 if (Subtarget->getTargetTriple().isOSMSVCRT()) {
965 // MSVCRT doesn't have powi; fall back to pow
966 setLibcallName(RTLIB::POWI_F32, nullptr);
967 setLibcallName(RTLIB::POWI_F64, nullptr);
968 }
969
970 // Make floating-point constants legal for the large code model, so they don't
971 // become loads from the constant pool.
972 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
975 }
976
977 // AArch64 does not have floating-point extending loads, i1 sign-extending
978 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
979 for (MVT VT : MVT::fp_valuetypes()) {
980 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
981 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
982 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
983 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
984 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
985 }
986 for (MVT VT : MVT::integer_valuetypes())
988
989 for (MVT WideVT : MVT::fp_valuetypes()) {
990 for (MVT NarrowVT : MVT::fp_valuetypes()) {
991 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
992 setTruncStoreAction(WideVT, NarrowVT, Expand);
993 }
994 }
995 }
996
997 if (Subtarget->hasFPARMv8()) {
1001 }
1002
1003 // Indexed loads and stores are supported.
1004 for (unsigned im = (unsigned)ISD::PRE_INC;
1006 setIndexedLoadAction(im, MVT::i8, Legal);
1007 setIndexedLoadAction(im, MVT::i16, Legal);
1008 setIndexedLoadAction(im, MVT::i32, Legal);
1009 setIndexedLoadAction(im, MVT::i64, Legal);
1010 setIndexedLoadAction(im, MVT::f64, Legal);
1011 setIndexedLoadAction(im, MVT::f32, Legal);
1012 setIndexedLoadAction(im, MVT::f16, Legal);
1013 setIndexedLoadAction(im, MVT::bf16, Legal);
1014 setIndexedStoreAction(im, MVT::i8, Legal);
1015 setIndexedStoreAction(im, MVT::i16, Legal);
1016 setIndexedStoreAction(im, MVT::i32, Legal);
1017 setIndexedStoreAction(im, MVT::i64, Legal);
1018 setIndexedStoreAction(im, MVT::f64, Legal);
1019 setIndexedStoreAction(im, MVT::f32, Legal);
1020 setIndexedStoreAction(im, MVT::f16, Legal);
1021 setIndexedStoreAction(im, MVT::bf16, Legal);
1022 }
1023
1024 // Trap.
1025 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1028
1029 // We combine OR nodes for bitfield operations.
1031 // Try to create BICs for vector ANDs.
1033
1034 // Vector add and sub nodes may conceal a high-half opportunity.
1035 // Also, try to fold ADD into CSINC/CSINV..
1038
1041
1042 // Try and combine setcc with csel
1044
1046
1053
1055
1057
1059
1063
1065
1067
1069
1071
1075
1077
1078 // In case of strict alignment, avoid an excessive number of byte wide stores.
1081 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1082
1086 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1087
1090
1093 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1094
1096
1098
1099 EnableExtLdPromotion = true;
1100
1101 // Set required alignment.
1103 // Set preferred alignments.
1104
1105 // Don't align loops on Windows. The SEH unwind info generation needs to
1106 // know the exact length of functions before the alignments have been
1107 // expanded.
1108 if (!Subtarget->isTargetWindows())
1112
1113 // Only change the limit for entries in a jump table if specified by
1114 // the subtarget, but not at the command line.
1115 unsigned MaxJT = STI.getMaximumJumpTableSize();
1116 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1118
1120
1122
1124
1125 if (Subtarget->hasNEON()) {
1126 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1127 // silliness like this:
1128 for (auto Op :
1146 setOperationAction(Op, MVT::v1f64, Expand);
1147
1148 for (auto Op :
1153 setOperationAction(Op, MVT::v1i64, Expand);
1154
1155 // AArch64 doesn't have direct vector -> f32 conversion instructions for
1156 // elements smaller than i32, so promote the input to i32 first.
1157 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1158 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1159
1160 // Similarly, there is no direct i32 -> f64 vector conversion instruction.
1161 // Or, direct i32 -> f16 vector conversion. Set it to Custom, so the
1162 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1165 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1167
1168 if (Subtarget->hasFullFP16()) {
1171
1180 } else {
1181 // when AArch64 doesn't have fullfp16 support, promote the input
1182 // to i32 first.
1183 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1184 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1185 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1186 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1187 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1188 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1189 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1190 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1191 }
1192
1193 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1194 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1201 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1206 }
1207
1208 // Custom handling for some quad-vector types to detect MULL.
1209 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1210 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1211 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1212 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1213 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1214 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1215
1216 // Saturates
1217 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1218 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1223 }
1224
1225 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1226 MVT::v4i32}) {
1233 }
1234
1235 // Vector reductions
1236 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1237 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1238 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1243
1245 }
1246 }
1247 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1248 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1257 }
1262
1264 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1265 // Likewise, narrowing and extending vector loads/stores aren't handled
1266 // directly.
1269
1270 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1273 } else {
1276 }
1279
1282
1283 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1284 setTruncStoreAction(VT, InnerVT, Expand);
1285 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1286 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1287 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1288 }
1289 }
1290
1291 // AArch64 has implementations of a lot of rounding-like FP operations.
1292 for (auto Op :
1297 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1299 if (Subtarget->hasFullFP16())
1300 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1302 }
1303
1304 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1305
1310
1314
1315 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1316 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1317 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1318 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1319 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1320 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1321
1322 // ADDP custom lowering
1323 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1325 // FADDP custom lowering
1326 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1328 }
1329
1330 if (Subtarget->hasSME()) {
1332 }
1333
1334 // FIXME: Move lowering for more nodes here if those are common between
1335 // SVE and SME.
1336 if (Subtarget->hasSVEorSME()) {
1337 for (auto VT :
1338 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1343 }
1344 }
1345
1346 if (Subtarget->hasSVEorSME()) {
1347 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1390
1396
1405
1410
1411 if (!Subtarget->isLittleEndian())
1413
1414 if (Subtarget->hasSVE2orSME())
1415 // For SLI/SRI.
1417 }
1418
1419 // Illegal unpacked integer vector types.
1420 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1423 }
1424
1425 // Legalize unpacked bitcasts to REINTERPRET_CAST.
1426 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
1427 MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
1429
1430 for (auto VT :
1431 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1432 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1434
1435 for (auto VT :
1436 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1444
1448
1449 // There are no legal MVT::nxv16f## based types.
1450 if (VT != MVT::nxv16i1) {
1453 }
1454 }
1455
1456 // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
1457 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1458 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1459 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1464 }
1465
1466 // Firstly, exclude all scalable vector extending loads/truncating stores,
1467 // including both integer and floating-point scalable vectors.
1469 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1470 setTruncStoreAction(VT, InnerVT, Expand);
1471 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1472 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1473 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1474 }
1475 }
1476
1477 // Then, selectively enable those which we directly support.
1478 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1479 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1480 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1481 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1482 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1483 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1484 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1485 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1486 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1487 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1488 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1489 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1490 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1491 }
1492
1493 // SVE supports truncating stores of 64 and 128-bit vectors
1494 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1495 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1496 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1497 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1498 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1499
1500 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1501 MVT::nxv4f32, MVT::nxv2f64}) {
1537 if (Subtarget->isSVEAvailable())
1542
1556
1568
1569 if (!Subtarget->isLittleEndian())
1571 }
1572
1573 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1580
1581 if (!Subtarget->isLittleEndian())
1583 }
1584
1587
1588 // NEON doesn't support integer divides, but SVE does
1589 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1590 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1593 }
1594
1595 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1596 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1597 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1598
1599 if (Subtarget->isSVEAvailable()) {
1600 // NEON doesn't support across-vector reductions, but SVE does.
1601 for (auto VT :
1602 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
1604 }
1605
1606 if (!Subtarget->isNeonAvailable()) {
1607 setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Custom);
1608 setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Custom);
1609 setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Custom);
1610 setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Custom);
1611 setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Custom);
1612 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Custom);
1613 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Custom);
1614 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Custom);
1615 setTruncStoreAction(MVT::v1f64, MVT::v1f16, Custom);
1616 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Custom);
1617 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Custom);
1618 setTruncStoreAction(MVT::v1f64, MVT::v1f32, Custom);
1619 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Custom);
1620 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom);
1621 for (MVT VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1622 MVT::v4i32, MVT::v1i64, MVT::v2i64})
1623 addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ true);
1624
1625 for (MVT VT :
1626 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
1627 addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ true);
1628 }
1629
1630 // NOTE: Currently this has to happen after computeRegisterProperties rather
1631 // than the preferred option of combining it with the addRegisterClass call.
1632 if (Subtarget->useSVEForFixedLengthVectors()) {
1635 addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ false);
1638 addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ false);
1639
1640 // 64-bit results can come from an input bigger than NEON supports.
1641 for (auto VT : {MVT::v8i8, MVT::v4i16})
1644
1645 // 128-bit results imply an input bigger than NEON supports.
1646 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1648 for (auto VT : {MVT::v8f16, MVT::v4f32})
1650
1651 // These operations are not supported on NEON but SVE can do them.
1653 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1654 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1655 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1656 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1657 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1658 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1659 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1660 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1661 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1662 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1663 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1664 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1665 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1666 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1667 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1672
1673 // Int operations with no NEON support.
1674 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1675 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1683 }
1684
1685
1686 // Use SVE for vectors with more than 2 elements.
1687 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1689 }
1690
1691 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1692 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1693 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1694 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1695
1697 }
1698
1699 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1700 // Only required for llvm.aarch64.mops.memset.tag
1702 }
1703
1705
1706 if (Subtarget->hasSVE()) {
1711 }
1712
1713 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1714
1715 IsStrictFPEnabled = true;
1717
1718 if (Subtarget->isWindowsArm64EC()) {
1719 // FIXME: are there intrinsics we need to exclude from this?
1720 for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
1721 auto code = static_cast<RTLIB::Libcall>(i);
1722 auto libcallName = getLibcallName(code);
1723 if ((libcallName != nullptr) && (libcallName[0] != '#')) {
1724 setLibcallName(code, Saver.save(Twine("#") + libcallName).data());
1725 }
1726 }
1727 }
1728}
1729
1730void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1731 assert(VT.isVector() && "VT should be a vector type");
1732
1733 if (VT.isFloatingPoint()) {
1735 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1736 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1737 }
1738
1739 // Mark vector float intrinsics as expand.
1740 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1750 }
1751
1752 // But we do support custom-lowering for FCOPYSIGN.
1753 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1754 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
1755 VT == MVT::v8f16) &&
1756 Subtarget->hasFullFP16()))
1758
1771
1775 for (MVT InnerVT : MVT::all_valuetypes())
1776 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1777
1778 // CNT supports only B element sizes; wider element types use CNT followed
1778 // by UADDLP to widen.
1779 if (VT != MVT::v8i8 && VT != MVT::v16i8)
1781
1787
1788 for (unsigned Opcode :
1791 setOperationAction(Opcode, VT, Custom);
1792
1793 if (!VT.isFloatingPoint())
1795
1796 // [SU][MIN|MAX] are available for all NEON types apart from i64.
1797 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1798 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1799 setOperationAction(Opcode, VT, Legal);
1800
1801 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1802 // NEON types.
1803 if (VT.isFloatingPoint() &&
1804 VT.getVectorElementType() != MVT::bf16 &&
1805 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1806 for (unsigned Opcode :
1812 setOperationAction(Opcode, VT, Legal);
1813
1814 // Strict fp extend and trunc are legal
1815 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
1817 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
1819
1820 // FIXME: We could potentially make use of the vector comparison instructions
1821 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
1822 // complications:
1823 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1824 // so we would need to expand when the condition code doesn't match the
1825 // kind of comparison.
1826 // * Some kinds of comparison require more than one FCMXY instruction so
1827 // would need to be expanded instead.
1828 // * The lowering of the non-strict versions involves target-specific ISD
1829 // nodes so we would likely need to add strict versions of all of them and
1830 // handle them appropriately.
1833
1834 if (Subtarget->isLittleEndian()) {
1835 for (unsigned im = (unsigned)ISD::PRE_INC;
1839 }
1840 }
1841
1842 if (Subtarget->hasD128()) {
1845 }
1846}
1847
1848bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
1849 EVT OpVT) const {
1850 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
1851 if (!Subtarget->hasSVE())
1852 return true;
1853
1854 // We can only support legal predicate result types. We can use the SVE
1855 // whilelo instruction for generating fixed-width predicates too.
1856 if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
1857 ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
1858 ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
1859 return true;
1860
1861 // The whilelo instruction only works with i32 or i64 scalar inputs.
1862 if (OpVT != MVT::i32 && OpVT != MVT::i64)
1863 return true;
1864
1865 return false;
1866}
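// Putting the checks above together (illustrative): a get.active.lane.mask
// with an nxv4i1 result and i64 trip-count operands maps directly onto a
// single WHILELO, so it is left alone (return false), whereas an unsupported
// result type such as v32i1, or i16 scalar operands, falls back to the
// generic expansion (return true).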
1867
1869 return !Subtarget->hasSVEorSME() || VT != MVT::nxv16i1;
1870}
1871
1872void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT,
1873 bool StreamingSVE) {
1874 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1875
1876 // By default everything must be expanded.
1877 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1879
1880 if (VT.isFloatingPoint()) {
1890 }
1891
1892 // Mark integer truncating stores/extending loads as having custom lowering
1893 if (VT.isInteger()) {
1894 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
1895 while (InnerVT != VT) {
1896 setTruncStoreAction(VT, InnerVT, Custom);
1897 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom);
1898 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom);
1899 InnerVT = InnerVT.changeVectorElementType(
1900 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
1901 }
1902 }
1903
1904 // Mark floating-point truncating stores/extending loads as having custom
1905 // lowering
1906 if (VT.isFloatingPoint()) {
1907 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
1908 while (InnerVT != VT) {
1909 setTruncStoreAction(VT, InnerVT, Custom);
1910 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom);
1911 InnerVT = InnerVT.changeVectorElementType(
1913 }
1914 }
1915
1916 // Lower fixed length vector operations to scalable equivalents.
1921 setOperationAction(ISD::BITCAST, VT, StreamingSVE ? Legal : Custom);
1956 setOperationAction(ISD::LOAD, VT, StreamingSVE ? Legal : Custom);
1957 setOperationAction(ISD::MGATHER, VT, StreamingSVE ? Expand : Custom);
1959 setOperationAction(ISD::MSCATTER, VT, StreamingSVE ? Expand : Custom);
1978 setOperationAction(ISD::STORE, VT, StreamingSVE ? Legal : Custom);
1994 StreamingSVE ? Expand : Custom);
2005}
2006
2007void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
2008 addRegisterClass(VT, &AArch64::FPR64RegClass);
2009 addTypeForNEON(VT);
2010}
2011
2012void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
2013 addRegisterClass(VT, &AArch64::FPR128RegClass);
2014 addTypeForNEON(VT);
2015}
2016
2017EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
2018 LLVMContext &C, EVT VT) const {
2019 if (!VT.isVector())
2020 return MVT::i32;
2021 if (VT.isScalableVector())
2022 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2023 return VT.changeVectorElementTypeToInteger();
2024}
2025
2026// isIntImmediate - This method tests to see if the node is a constant
2027// operand. If so, Imm will receive the value.
2028static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2029 if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
2030 Imm = C->getZExtValue();
2031 return true;
2032 }
2033 return false;
2034}
2035
2036// isOpcWithIntImmediate - This method tests to see if the node is a specific
2037// opcode and that it has an immediate integer right operand.
2038// If so, Imm will receive the value.
2039static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2040 uint64_t &Imm) {
2041 return N->getOpcode() == Opc &&
2042 isIntImmediate(N->getOperand(1).getNode(), Imm);
2043}
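// Typical use (illustrative): isOpcWithIntImmediate(N, ISD::AND, Imm) matches
// a node of the form (and x, C), leaving the constant's zero-extended value
// in Imm; it does not match when the right-hand operand is not a constant.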
2044
2045static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2046 const APInt &Demanded,
2048 unsigned NewOpc) {
2049 uint64_t OldImm = Imm, NewImm, Enc;
2050 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2051
2052 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2053 // bimm64.
2054 if (Imm == 0 || Imm == Mask ||
2056 return false;
2057
2058 unsigned EltSize = Size;
2059 uint64_t DemandedBits = Demanded.getZExtValue();
2060
2061 // Clear bits that are not demanded.
2062 Imm &= DemandedBits;
2063
2064 while (true) {
2065 // The goal here is to set the non-demanded bits in a way that minimizes
2066 // the number of switching between 0 and 1. In order to achieve this goal,
2067 // we set the non-demanded bits to the value of the preceding demanded bits.
2068 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2069 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2070 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2071 // The final result is 0b11000011.
2072 uint64_t NonDemandedBits = ~DemandedBits;
2073 uint64_t InvertedImm = ~Imm & DemandedBits;
2074 uint64_t RotatedImm =
2075 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2076 NonDemandedBits;
2077 uint64_t Sum = RotatedImm + NonDemandedBits;
2078 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2079 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2080 NewImm = (Imm | Ones) & Mask;
2081
2082 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2083 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2084 // we halve the element size and continue the search.
2085 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2086 break;
2087
2088 // We cannot shrink the element size any further if it is 2 bits.
2089 if (EltSize == 2)
2090 return false;
2091
2092 EltSize /= 2;
2093 Mask >>= EltSize;
2094 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2095
2096 // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
2097 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2098 return false;
2099
2100 // Merge the upper and lower halves of Imm and DemandedBits.
2101 Imm |= Hi;
2102 DemandedBits |= DemandedBitsHi;
2103 }
2104
2105 ++NumOptimizedImms;
2106
2107 // Replicate the element across the register width.
2108 while (EltSize < Size) {
2109 NewImm |= NewImm << EltSize;
2110 EltSize *= 2;
2111 }
2112
2113 (void)OldImm;
2114 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2115 "demanded bits should never be altered");
2116 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2117
2118 // Create the new constant immediate node.
2119 EVT VT = Op.getValueType();
2120 SDLoc DL(Op);
2121 SDValue New;
2122
2123 // If the new constant immediate is all-zeros or all-ones, let the target
2124 // independent DAG combine optimize this node.
2125 if (NewImm == 0 || NewImm == OrigMask) {
2126 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2127 TLO.DAG.getConstant(NewImm, DL, VT));
2128 // Otherwise, create a machine node so that target independent DAG combine
2129 // doesn't undo this optimization.
2130 } else {
2132 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2133 New = SDValue(
2134 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2135 }
2136
2137 return TLO.CombineTo(Op, New);
2138}
2139
2141 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2142 TargetLoweringOpt &TLO) const {
2143 // Delay this optimization to as late as possible.
2144 if (!TLO.LegalOps)
2145 return false;
2146
2148 return false;
2149
2150 EVT VT = Op.getValueType();
2151 if (VT.isVector())
2152 return false;
2153
2154 unsigned Size = VT.getSizeInBits();
2155 assert((Size == 32 || Size == 64) &&
2156 "i32 or i64 is expected after legalization.");
2157
2158 // Exit early if we demand all bits.
2159 if (DemandedBits.popcount() == Size)
2160 return false;
2161
2162 unsigned NewOpc;
2163 switch (Op.getOpcode()) {
2164 default:
2165 return false;
2166 case ISD::AND:
2167 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2168 break;
2169 case ISD::OR:
2170 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2171 break;
2172 case ISD::XOR:
2173 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2174 break;
2175 }
2176 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2177 if (!C)
2178 return false;
2179 uint64_t Imm = C->getZExtValue();
2180 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2181}
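// A worked example of the shrink (illustrative, assuming a 32-bit AND where
// only the low four bits of the result are demanded): for
// (and x:i32, 0x00FFF00F) the constant is not a valid logical immediate, so
// optimizeLogicalImm fills the non-demanded bits from the neighbouring
// demanded bits and arrives at NewImm = 0xFFFFFFFF. Since that is the
// all-ones mask, the node is rebuilt as a plain ISD::AND with -1, which the
// generic combiner then folds away entirely; a less convenient demanded mask
// would instead be materialised as an ANDWri/ANDXri with an encodable bitmask
// immediate.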
2182
2183/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2184/// Mask are known to be either zero or one and return them in Known.
2186 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2187 const SelectionDAG &DAG, unsigned Depth) const {
2188 switch (Op.getOpcode()) {
2189 default:
2190 break;
2191 case AArch64ISD::DUP: {
2192 SDValue SrcOp = Op.getOperand(0);
2193 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2194 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2195 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2196 "Expected DUP implicit truncation");
2197 Known = Known.trunc(Op.getScalarValueSizeInBits());
2198 }
2199 break;
2200 }
2201 case AArch64ISD::CSEL: {
2202 KnownBits Known2;
2203 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2204 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2205 Known = Known.intersectWith(Known2);
2206 break;
2207 }
2208 case AArch64ISD::BICi: {
2209 // Compute the bit cleared value.
2210 uint64_t Mask =
2211 ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
2212 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2213 Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
2214 break;
2215 }
2216 case AArch64ISD::VLSHR: {
2217 KnownBits Known2;
2218 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2219 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2220 Known = KnownBits::lshr(Known, Known2);
2221 break;
2222 }
2223 case AArch64ISD::VASHR: {
2224 KnownBits Known2;
2225 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2226 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2227 Known = KnownBits::ashr(Known, Known2);
2228 break;
2229 }
2230 case AArch64ISD::VSHL: {
2231 KnownBits Known2;
2232 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2233 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2234 Known = KnownBits::shl(Known, Known2);
2235 break;
2236 }
2237 case AArch64ISD::MOVI: {
2239 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2240 break;
2241 }
2243 case AArch64ISD::ADDlow: {
2244 if (!Subtarget->isTargetILP32())
2245 break;
2246 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2247 Known.Zero = APInt::getHighBitsSet(64, 32);
2248 break;
2249 }
2251 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2252 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2253 break;
2254 }
2256 Intrinsic::ID IntID =
2257 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2258 switch (IntID) {
2259 default: return;
2260 case Intrinsic::aarch64_ldaxr:
2261 case Intrinsic::aarch64_ldxr: {
2262 unsigned BitWidth = Known.getBitWidth();
2263 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2264 unsigned MemBits = VT.getScalarSizeInBits();
2265 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2266 return;
2267 }
2268 }
2269 break;
2270 }
2272 case ISD::INTRINSIC_VOID: {
2273 unsigned IntNo = Op.getConstantOperandVal(0);
2274 switch (IntNo) {
2275 default:
2276 break;
2277 case Intrinsic::aarch64_neon_uaddlv: {
2278 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2279 unsigned BitWidth = Known.getBitWidth();
2280 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2281 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2282 assert(BitWidth >= Bound && "Unexpected width!");
2284 Known.Zero |= Mask;
2285 }
2286 break;
2287 }
2288 case Intrinsic::aarch64_neon_umaxv:
2289 case Intrinsic::aarch64_neon_uminv: {
2290 // Figure out the datatype of the vector operand. The UMINV instruction
2291 // will zero extend the result, so we can mark as known zero all the
2292 // bits larger than the element datatype. 32-bit or larger doesn't need
2293 // this as those are legal types and will be handled by isel directly.
2294 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2295 unsigned BitWidth = Known.getBitWidth();
2296 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2297 assert(BitWidth >= 8 && "Unexpected width!");
2299 Known.Zero |= Mask;
2300 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2301 assert(BitWidth >= 16 && "Unexpected width!");
2303 Known.Zero |= Mask;
2304 }
2305 break;
2306 } break;
2307 }
2308 }
2309 }
2310}
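// Example of the intrinsic case above (illustrative): for an exclusive load
// via @llvm.aarch64.ldxr whose memory VT is i8, zero-extended into an i64
// result, MemBits is 8, so bits 8..63 of the result are reported as known
// zero, which lets later combines drop redundant masking such as
// 'and x, #0xff'.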
2311
2313 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2314 unsigned Depth) const {
2315 EVT VT = Op.getValueType();
2316 unsigned VTBits = VT.getScalarSizeInBits();
2317 unsigned Opcode = Op.getOpcode();
2318 switch (Opcode) {
2319 case AArch64ISD::CMEQ:
2320 case AArch64ISD::CMGE:
2321 case AArch64ISD::CMGT:
2322 case AArch64ISD::CMHI:
2323 case AArch64ISD::CMHS:
2324 case AArch64ISD::FCMEQ:
2325 case AArch64ISD::FCMGE:
2326 case AArch64ISD::FCMGT:
2327 case AArch64ISD::CMEQz:
2328 case AArch64ISD::CMGEz:
2329 case AArch64ISD::CMGTz:
2330 case AArch64ISD::CMLEz:
2331 case AArch64ISD::CMLTz:
2332 case AArch64ISD::FCMEQz:
2333 case AArch64ISD::FCMGEz:
2334 case AArch64ISD::FCMGTz:
2335 case AArch64ISD::FCMLEz:
2336 case AArch64ISD::FCMLTz:
2337 // Compares return either 0 or all-ones
2338 return VTBits;
2339 }
2340
2341 return 1;
2342}
2343
2345 EVT) const {
2346 return MVT::i64;
2347}
2348
2350 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2351 unsigned *Fast) const {
2352 if (Subtarget->requiresStrictAlign())
2353 return false;
2354
2355 if (Fast) {
2356 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2357 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2358 // See comments in performSTORECombine() for more details about
2359 // these conditions.
2360
2361 // Code that uses clang vector extensions can mark that it
2362 // wants unaligned accesses to be treated as fast by
2363 // underspecifying alignment to be 1 or 2.
2364 Alignment <= 2 ||
2365
2366 // Disregard v2i64. Memcpy lowering produces those and splitting
2367 // them regresses performance on micro-benchmarks and olden/bh.
2368 VT == MVT::v2i64;
2369 }
2370 return true;
2371}
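// For instance (illustrative): once strict alignment is off, a misaligned
// 16-byte store is always reported as allowed here, but on a subtarget where
// isMisaligned128StoreSlow() holds, *Fast is only set when the type is v2i64
// itself or the alignment was deliberately understated as 1 or 2 bytes; other
// 128-bit stores report *Fast = false so callers can prefer splitting them.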
2372
2373// Same as above but handling LLTs instead.
2375 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2376 unsigned *Fast) const {
2377 if (Subtarget->requiresStrictAlign())
2378 return false;
2379
2380 if (Fast) {
2381 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2382 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2383 Ty.getSizeInBytes() != 16 ||
2384 // See comments in performSTORECombine() for more details about
2385 // these conditions.
2386
2387 // Code that uses clang vector extensions can mark that it
2388 // wants unaligned accesses to be treated as fast by
2389 // underspecifying alignment to be 1 or 2.
2390 Alignment <= 2 ||
2391
2392 // Disregard v2i64. Memcpy lowering produces those and splitting
2393 // them regresses performance on micro-benchmarks and olden/bh.
2394 Ty == LLT::fixed_vector(2, 64);
2395 }
2396 return true;
2397}
2398
2399FastISel *
2401 const TargetLibraryInfo *libInfo) const {
2402 return AArch64::createFastISel(funcInfo, libInfo);
2403}
2404
2405const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2406#define MAKE_CASE(V) \
2407 case V: \
2408 return #V;
2409 switch ((AArch64ISD::NodeType)Opcode) {
2411 break;
2728 }
2729#undef MAKE_CASE
2730 return nullptr;
2731}
2732
2735 MachineBasicBlock *MBB) const {
2736 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2737 // phi node:
2738
2739 // OrigBB:
2740 // [... previous instrs leading to comparison ...]
2741 // b.ne TrueBB
2742 // b EndBB
2743 // TrueBB:
2744 // ; Fallthrough
2745 // EndBB:
2746 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2747
2748 MachineFunction *MF = MBB->getParent();
2749 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2750 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2751 DebugLoc DL = MI.getDebugLoc();
2753
2754 Register DestReg = MI.getOperand(0).getReg();
2755 Register IfTrueReg = MI.getOperand(1).getReg();
2756 Register IfFalseReg = MI.getOperand(2).getReg();
2757 unsigned CondCode = MI.getOperand(3).getImm();
2758 bool NZCVKilled = MI.getOperand(4).isKill();
2759
2760 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2761 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2762 MF->insert(It, TrueBB);
2763 MF->insert(It, EndBB);
2764
2765 // Transfer rest of current basic-block to EndBB
2766 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2767 MBB->end());
2769
2770 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2771 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2772 MBB->addSuccessor(TrueBB);
2773 MBB->addSuccessor(EndBB);
2774
2775 // TrueBB falls through to the end.
2776 TrueBB->addSuccessor(EndBB);
2777
2778 if (!NZCVKilled) {
2779 TrueBB->addLiveIn(AArch64::NZCV);
2780 EndBB->addLiveIn(AArch64::NZCV);
2781 }
2782
2783 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2784 .addReg(IfTrueReg)
2785 .addMBB(TrueBB)
2786 .addReg(IfFalseReg)
2787 .addMBB(MBB);
2788
2789 MI.eraseFromParent();
2790 return EndBB;
2791}
2792
2794 MachineInstr &MI, MachineBasicBlock *BB) const {
2796 BB->getParent()->getFunction().getPersonalityFn())) &&
2797 "SEH does not use catchret!");
2798 return BB;
2799}
2800
2803 MachineBasicBlock *MBB) const {
2804 MachineFunction &MF = *MBB->getParent();
2805 MachineBasicBlock::iterator MBBI = MI.getIterator();
2807 const AArch64InstrInfo &TII =
2808 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
2809 Register TargetReg = MI.getOperand(0).getReg();
2810 MachineBasicBlock::iterator NextInst =
2811 TII.probedStackAlloc(MBBI, TargetReg, false);
2812
2813 MI.eraseFromParent();
2814 return NextInst->getParent();
2815}
2816
2818AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2820 MachineBasicBlock *BB) const {
2821 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2822 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2823
2824 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2825 MIB.add(MI.getOperand(1)); // slice index register
2826 MIB.add(MI.getOperand(2)); // slice index offset
2827 MIB.add(MI.getOperand(3)); // pg
2828 MIB.add(MI.getOperand(4)); // base
2829 MIB.add(MI.getOperand(5)); // offset
2830
2831 MI.eraseFromParent(); // The pseudo is gone now.
2832 return BB;
2833}
2834
2837 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2839 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
2840
2841 MIB.addReg(AArch64::ZA, RegState::Define);
2842 MIB.add(MI.getOperand(0)); // Vector select register
2843 MIB.add(MI.getOperand(1)); // Vector select offset
2844 MIB.add(MI.getOperand(2)); // Base
2845 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
2846
2847 MI.eraseFromParent(); // The pseudo is gone now.
2848 return BB;
2849}
2850
2853 unsigned Opcode,
2854 bool Op0IsDef) const {
2855 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2857
2858 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
2859 .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
2860 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
2861 MIB.add(MI.getOperand(I));
2862
2863 MI.eraseFromParent(); // The pseudo is gone now.
2864 return BB;
2865}
2866
2868AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
2870 MachineBasicBlock *BB, bool HasTile) const {
2871 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2872 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2873 unsigned StartIdx = 0;
2874
2875 if (HasTile) {
2876 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2877 MIB.addReg(BaseReg + MI.getOperand(0).getImm());
2878 StartIdx = 1;
2879 } else
2880 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
2881
2882 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
2883 MIB.add(MI.getOperand(I));
2884
2885 MI.eraseFromParent(); // The pseudo is gone now.
2886 return BB;
2887}
2888
2889 MachineBasicBlock *AArch64TargetLowering::EmitZero(MachineInstr &MI,
2890 MachineBasicBlock *BB) const {
2891 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2892 MachineInstrBuilder MIB =
2893 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
2894 MIB.add(MI.getOperand(0)); // Mask
2895
2896 unsigned Mask = MI.getOperand(0).getImm();
2897 for (unsigned I = 0; I < 8; I++) {
2898 if (Mask & (1 << I))
2899 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
2900 }
2901
2902 MI.eraseFromParent(); // The pseudo is gone now.
2903 return BB;
2904}
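// A minimal standalone sketch of how the ZERO_M mask operand handled above maps to
// implicitly-defined ZAD tiles; the snippet is illustrative only and is not part of the
// lowering code (kept under #if 0).
#if 0
#include <cstdio>

int main() {
  unsigned Mask = 0b00100101; // example mask operand
  for (unsigned I = 0; I < 8; ++I)
    if (Mask & (1u << I))
      std::printf("ZAD%u is implicitly defined\n", I); // prints ZAD0, ZAD2, ZAD5
  return 0;
}
#endif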
2905
2906 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
2907 MachineInstr &MI, MachineBasicBlock *BB) const {
2908
2909 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
2910 if (SMEOrigInstr != -1) {
2911 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2912 uint64_t SMEMatrixType =
2913 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
2914 switch (SMEMatrixType) {
2915 case AArch64::SMEMatrixArray:
2916 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB, /*HasTile*/ false);
2917 case AArch64::SMEMatrixTileB:
2918 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB, /*HasTile*/ true);
2919 case AArch64::SMEMatrixTileH:
2920 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB, /*HasTile*/ true);
2921 case AArch64::SMEMatrixTileS:
2922 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB, /*HasTile*/ true);
2923 case AArch64::SMEMatrixTileD:
2924 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB, /*HasTile*/ true);
2925 case AArch64::SMEMatrixTileQ:
2926 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB, /*HasTile*/ true);
2927 }
2928 }
2929
2930 switch (MI.getOpcode()) {
2931 default:
2932#ifndef NDEBUG
2933 MI.dump();
2934#endif
2935 llvm_unreachable("Unexpected instruction for custom inserter!");
2936
2937 case AArch64::F128CSEL:
2938 return EmitF128CSEL(MI, BB);
2939 case TargetOpcode::STATEPOINT:
2940 // STATEPOINT is a pseudo instruction which has no implicit defs/uses,
2941 // while the BL call instruction (which the statepoint is eventually
2942 // lowered to) has an implicit def of LR. This def is early-clobber, as it
2943 // is set at the moment of the call, before any use is read.
2944 // Add this implicit dead def here as a workaround.
2945 MI.addOperand(*MI.getMF(),
2946 MachineOperand::CreateReg(
2947 AArch64::LR, /*isDef*/ true,
2948 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
2949 /*isUndef*/ false, /*isEarlyClobber*/ true));
2950 [[fallthrough]];
2951 case TargetOpcode::STACKMAP:
2952 case TargetOpcode::PATCHPOINT:
2953 return emitPatchPoint(MI, BB);
2954
2955 case TargetOpcode::PATCHABLE_EVENT_CALL:
2956 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
2957 return BB;
2958
2959 case AArch64::CATCHRET:
2960 return EmitLoweredCatchRet(MI, BB);
2961
2962 case AArch64::PROBED_STACKALLOC_DYN:
2963 return EmitDynamicProbedAlloc(MI, BB);
2964
2965 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
2966 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
2967 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
2968 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
2969 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
2970 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
2971 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
2972 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
2973 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
2974 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
2975 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
2976 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
2977 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
2978 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
2979 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
2980 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
2981 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
2982 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
2983 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
2984 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
2985 case AArch64::LDR_ZA_PSEUDO:
2986 return EmitFill(MI, BB);
2987 case AArch64::LDR_TX_PSEUDO:
2988 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
2989 case AArch64::STR_TX_PSEUDO:
2990 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
2991 case AArch64::ZERO_M_PSEUDO:
2992 return EmitZero(MI, BB);
2993 case AArch64::ZERO_T_PSEUDO:
2994 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
2995 }
2996}
2997
2998//===----------------------------------------------------------------------===//
2999// AArch64 Lowering private implementation.
3000//===----------------------------------------------------------------------===//
3001
3002//===----------------------------------------------------------------------===//
3003// Lowering Code
3004//===----------------------------------------------------------------------===//
3005
3006// Forward declarations of SVE fixed length lowering helpers
3011 SelectionDAG &DAG);
3014 EVT VT);
3015
3016/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3017static bool isZerosVector(const SDNode *N) {
3018 // Look through a bit convert.
3019 while (N->getOpcode() == ISD::BITCAST)
3020 N = N->getOperand(0).getNode();
3021
3022 if (ISD::isConstantSplatVectorAllZeros(N))
3023 return true;
3024
3025 if (N->getOpcode() != AArch64ISD::DUP)
3026 return false;
3027
3028 auto Opnd0 = N->getOperand(0);
3029 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3030}
3031
3032/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3033/// CC
3034 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
3035 switch (CC) {
3036 default:
3037 llvm_unreachable("Unknown condition code!");
3038 case ISD::SETNE:
3039 return AArch64CC::NE;
3040 case ISD::SETEQ:
3041 return AArch64CC::EQ;
3042 case ISD::SETGT:
3043 return AArch64CC::GT;
3044 case ISD::SETGE:
3045 return AArch64CC::GE;
3046 case ISD::SETLT:
3047 return AArch64CC::LT;
3048 case ISD::SETLE:
3049 return AArch64CC::LE;
3050 case ISD::SETUGT:
3051 return AArch64CC::HI;
3052 case ISD::SETUGE:
3053 return AArch64CC::HS;
3054 case ISD::SETULT:
3055 return AArch64CC::LO;
3056 case ISD::SETULE:
3057 return AArch64CC::LS;
3058 }
3059}
3060
3061/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3062 static void changeFPCCToAArch64CC(ISD::CondCode CC,
3063 AArch64CC::CondCode &CondCode,
3064 AArch64CC::CondCode &CondCode2) {
3065 CondCode2 = AArch64CC::AL;
3066 switch (CC) {
3067 default:
3068 llvm_unreachable("Unknown FP condition!");
3069 case ISD::SETEQ:
3070 case ISD::SETOEQ:
3071 CondCode = AArch64CC::EQ;
3072 break;
3073 case ISD::SETGT:
3074 case ISD::SETOGT:
3075 CondCode = AArch64CC::GT;
3076 break;
3077 case ISD::SETGE:
3078 case ISD::SETOGE:
3079 CondCode = AArch64CC::GE;
3080 break;
3081 case ISD::SETOLT:
3082 CondCode = AArch64CC::MI;
3083 break;
3084 case ISD::SETOLE:
3085 CondCode = AArch64CC::LS;
3086 break;
3087 case ISD::SETONE:
3088 CondCode = AArch64CC::MI;
3089 CondCode2 = AArch64CC::GT;
3090 break;
3091 case ISD::SETO:
3092 CondCode = AArch64CC::VC;
3093 break;
3094 case ISD::SETUO:
3095 CondCode = AArch64CC::VS;
3096 break;
3097 case ISD::SETUEQ:
3098 CondCode = AArch64CC::EQ;
3099 CondCode2 = AArch64CC::VS;
3100 break;
3101 case ISD::SETUGT:
3102 CondCode = AArch64CC::HI;
3103 break;
3104 case ISD::SETUGE:
3105 CondCode = AArch64CC::PL;
3106 break;
3107 case ISD::SETLT:
3108 case ISD::SETULT:
3109 CondCode = AArch64CC::LT;
3110 break;
3111 case ISD::SETLE:
3112 case ISD::SETULE:
3113 CondCode = AArch64CC::LE;
3114 break;
3115 case ISD::SETNE:
3116 case ISD::SETUNE:
3117 CondCode = AArch64CC::NE;
3118 break;
3119 }
3120}
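// A minimal standalone check of the SETONE mapping above: an ordered-and-unequal compare
// is equivalent to testing LT (MI) or GT, which is why two condition codes are returned.
// The snippet is illustrative only (kept under #if 0).
#if 0
#include <cassert>
#include <cmath>

int main() {
  double Vals[] = {1.0, 2.0, std::nan("")};
  for (double A : Vals)
    for (double B : Vals) {
      bool One = !std::isnan(A) && !std::isnan(B) && A != B; // SETONE
      assert(One == ((A < B) || (A > B)));                   // MI or GT
    }
  return 0;
}
#endif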
3121
3122/// Convert a DAG fp condition code to an AArch64 CC.
3123/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3124/// should be AND'ed instead of OR'ed.
3125 static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
3126 AArch64CC::CondCode &CondCode,
3127 AArch64CC::CondCode &CondCode2) {
3128 CondCode2 = AArch64CC::AL;
3129 switch (CC) {
3130 default:
3131 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3132 assert(CondCode2 == AArch64CC::AL);
3133 break;
3134 case ISD::SETONE:
3135 // (a one b)
3136 // == ((a olt b) || (a ogt b))
3137 // == ((a ord b) && (a une b))
3138 CondCode = AArch64CC::VC;
3139 CondCode2 = AArch64CC::NE;
3140 break;
3141 case ISD::SETUEQ:
3142 // (a ueq b)
3143 // == ((a uno b) || (a oeq b))
3144 // == ((a ule b) && (a uge b))
3145 CondCode = AArch64CC::PL;
3146 CondCode2 = AArch64CC::LE;
3147 break;
3148 }
3149}
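// A minimal standalone check of the AND-form decompositions above: UEQ is equivalent to
// ULE && UGE, and ONE to ORD && UNE. Illustrative only (kept under #if 0).
#if 0
#include <cassert>
#include <cmath>

int main() {
  double Vals[] = {1.0, 2.0, std::nan("")};
  for (double A : Vals)
    for (double B : Vals) {
      bool IsNaN = std::isnan(A) || std::isnan(B);
      bool UEQ = IsNaN || A == B;
      assert(UEQ == (!(A > B) && !(A < B)));        // ULE && UGE
      bool ONE = !IsNaN && A != B;
      assert(ONE == (!IsNaN && (IsNaN || A != B))); // ORD && UNE
    }
  return 0;
}
#endif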
3150
3151/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3152/// CC usable with the vector instructions. Fewer operations are available
3153/// without a real NZCV register, so we have to use less efficient combinations
3154/// to get the same effect.
3155 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
3156 AArch64CC::CondCode &CondCode,
3157 AArch64CC::CondCode &CondCode2,
3158 bool &Invert) {
3159 Invert = false;
3160 switch (CC) {
3161 default:
3162 // Mostly the scalar mappings work fine.
3163 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3164 break;
3165 case ISD::SETUO:
3166 Invert = true;
3167 [[fallthrough]];
3168 case ISD::SETO:
3169 CondCode = AArch64CC::MI;
3170 CondCode2 = AArch64CC::GE;
3171 break;
3172 case ISD::SETUEQ:
3173 case ISD::SETULT:
3174 case ISD::SETULE:
3175 case ISD::SETUGT:
3176 case ISD::SETUGE:
3177 // All of the compare-mask comparisons are ordered, but we can switch
3178 // between the two by a double inversion. E.g. ULE == !OGT.
3179 Invert = true;
3180 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3181 CondCode, CondCode2);
3182 break;
3183 }
3184}
3185
3186 static bool isLegalArithImmed(uint64_t C) {
3187 // Matches AArch64DAGToDAGISel::SelectArithImmed().
3188 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3189 LLVM_DEBUG(dbgs() << "Is imm " << C
3190 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3191 return IsLegal;
3192}
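// A minimal standalone illustration of the immediate check above: AArch64 arithmetic
// immediates are a 12-bit value, optionally shifted left by 12. The helper name
// isLegalArithImmedSketch is illustrative, not part of this file (kept under #if 0).
#if 0
#include <cassert>
#include <cstdint>

static bool isLegalArithImmedSketch(uint64_t C) {
  return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
}

int main() {
  assert(isLegalArithImmedSketch(4095));       // fits in 12 bits
  assert(isLegalArithImmedSketch(0xABC000));   // 12-bit value shifted by 12
  assert(!isLegalArithImmedSketch(4097));      // needs 13 bits, not shifted
  assert(!isLegalArithImmedSketch(0x1000001)); // too wide even when shifted
  return 0;
}
#endif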
3193
3194 // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
3195 // the grounds that "op1 - (-op2) == op1 + op2"? Not always: the C and V flags
3196// can be set differently by this operation. It comes down to whether
3197// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3198// everything is fine. If not then the optimization is wrong. Thus general
3199// comparisons are only valid if op2 != 0.
3200//
3201// So, finally, the only LLVM-native comparisons that don't mention C and V
3202// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
3203// the absence of information about op2.
3204 static bool isCMN(SDValue Op, ISD::CondCode CC) {
3205 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3206 (CC == ISD::SETEQ || CC == ISD::SETNE);
3207}
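// A minimal standalone demonstration of why the CMN rewrite above is restricted to EQ/NE
// when op2 might be zero: the carry produced by SUBS (op1 - 0) and ADDS (op1 + 0) differs,
// while the result (and hence Z/N) agrees. Helper names are illustrative (kept under #if 0).
#if 0
#include <cstdint>
#include <cstdio>

static bool carryFromAdd(uint64_t A, uint64_t B) { return A + B < A; } // ADDS C flag
static bool carryFromSub(uint64_t A, uint64_t B) { return A >= B; }    // SUBS C flag (no borrow)

int main() {
  uint64_t Op1 = 5;
  for (uint64_t Op2 : {uint64_t(3), uint64_t(0)}) {
    bool CSub = carryFromSub(Op1, uint64_t(0) - Op2); // CMP op1, -op2
    bool CAdd = carryFromAdd(Op1, Op2);               // CMN op1, op2
    std::printf("op2=%llu: C(cmp)=%d C(cmn)=%d\n",
                (unsigned long long)Op2, CSub, CAdd);
  }
  // For op2 == 3 the two agree; for op2 == 0 they differ (1 vs 0), so only
  // conditions that ignore C and V (EQ/NE) are safe in general.
  return 0;
}
#endif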
3208
3209 static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
3210 SelectionDAG &DAG, SDValue Chain,
3211 bool IsSignaling) {
3212 EVT VT = LHS.getValueType();
3213 assert(VT != MVT::f128);
3214
3215 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3216
3217 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3218 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3219 {Chain, LHS});
3220 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3221 {LHS.getValue(1), RHS});
3222 Chain = RHS.getValue(1);
3223 VT = MVT::f32;
3224 }
3225 unsigned Opcode =
3226 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3227 return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
3228}
3229
3230 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3231 const SDLoc &dl, SelectionDAG &DAG) {
3232 EVT VT = LHS.getValueType();
3233 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3234
3235 if (VT.isFloatingPoint()) {
3236 assert(VT != MVT::f128);
3237 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3238 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
3239 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
3240 VT = MVT::f32;
3241 }
3242 return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
3243 }
3244
3245 // The CMP instruction is just an alias for SUBS, and representing it as
3246 // SUBS means that it's possible to get CSE with subtract operations.
3247 // A later phase can perform the optimization of setting the destination
3248 // register to WZR/XZR if it ends up being unused.
3249 unsigned Opcode = AArch64ISD::SUBS;
3250
3251 if (isCMN(RHS, CC)) {
3252 // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
3253 Opcode = AArch64ISD::ADDS;
3254 RHS = RHS.getOperand(1);
3255 } else if (isCMN(LHS, CC)) {
3256 // As we are looking for EQ/NE compares, the operands can be commuted; can
3257 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
3258 Opcode = AArch64ISD::ADDS;
3259 LHS = LHS.getOperand(1);
3260 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3261 if (LHS.getOpcode() == ISD::AND) {
3262 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3263 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3264 // of the signed comparisons.
3265 const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
3266 DAG.getVTList(VT, MVT_CC),
3267 LHS.getOperand(0),
3268 LHS.getOperand(1));
3269 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3270 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3271 return ANDSNode.getValue(1);
3272 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3273 // Use result of ANDS
3274 return LHS.getValue(1);
3275 }
3276 }
3277
3278 return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
3279 .getValue(1);
3280}
3281
3282/// \defgroup AArch64CCMP CMP;CCMP matching
3283///
3284/// These functions deal with the formation of CMP;CCMP;... sequences.
3285/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3286/// a comparison. They set the NZCV flags to a predefined value if their
3287 /// predicate is false. This allows expressing arbitrary conjunctions, for
3288/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3289/// expressed as:
3290/// cmp A
3291/// ccmp B, inv(CB), CA
3292/// check for CB flags
3293///
3294/// This naturally lets us implement chains of AND operations with SETCC
3295/// operands. And we can even implement some other situations by transforming
3296/// them:
3297/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3298/// negating the flags used in a CCMP/FCCMP operations.
3299/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3300/// by negating the flags we test for afterwards. i.e.
3301/// NEG (CMP CCMP CCCMP ...) can be implemented.
3302/// - Note that we can only ever negate all previously processed results.
3303/// What we can not implement by flipping the flags to test is a negation
3304/// of two sub-trees (because the negation affects all sub-trees emitted so
3305/// far, so the 2nd sub-tree we emit would also affect the first).
3306/// With those tools we can implement some OR operations:
3307/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3308/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3309/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3310/// elimination rules from earlier to implement the whole thing as a
3311/// CCMP/FCCMP chain.
3312///
3313/// As complete example:
3314/// or (or (setCA (cmp A)) (setCB (cmp B)))
3315/// (and (setCC (cmp C)) (setCD (cmp D)))"
3316/// can be reassociated to:
3317 /// or (and (setCC (cmp C)) (setCD (cmp D)))
3318 /// (or (setCA (cmp A)) (setCB (cmp B)))
3319/// can be transformed to:
3320/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3321/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
3322/// which can be implemented as:
3323/// cmp C
3324/// ccmp D, inv(CD), CC
3325/// ccmp A, CA, inv(CD)
3326/// ccmp B, CB, inv(CA)
3327/// check for CB flags
3328///
3329/// A counterexample is "or (and A B) (and C D)" which translates to
3330/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
3331/// can only implement 1 of the inner (not) operations, but not both!
3332/// @{
3333
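// A minimal standalone check of the De Morgan rewrite described above, which lets OR
// chains be turned into negated AND chains of conditional compares:
// (A || B) == !(!A && !B). Illustrative only (kept under #if 0).
#if 0
#include <cassert>

int main() {
  for (int A = 0; A <= 1; ++A)
    for (int B = 0; B <= 1; ++B) {
      bool Or = A || B;
      bool Rewritten = !(!A && !B); // NEG (AND (NEG A) (NEG B))
      assert(Or == Rewritten);
    }
  return 0;
}
#endif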
3334/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3335 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
3336 ISD::CondCode CC, SDValue CCOp,
3337 AArch64CC::CondCode Predicate,
3338 AArch64CC::CondCode OutCC,
3339 const SDLoc &DL, SelectionDAG &DAG) {
3340 unsigned Opcode = 0;
3341 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3342
3343 if (LHS.getValueType().isFloatingPoint()) {
3344 assert(LHS.getValueType() != MVT::f128);
3345 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3346 LHS.getValueType() == MVT::bf16) {
3347 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3348 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3349 }
3350 Opcode = AArch64ISD::FCCMP;
3351 } else if (RHS.getOpcode() == ISD::SUB) {
3352 SDValue SubOp0 = RHS.getOperand(0);
3353 if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
3354 // See emitComparison() on why we can only do this for SETEQ and SETNE.
3355 Opcode = AArch64ISD::CCMN;
3356 RHS = RHS.getOperand(1);
3357 }
3358 }
3359 if (Opcode == 0)
3360 Opcode = AArch64ISD::CCMP;
3361
3362 SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
3363 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
3364 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3365 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3366 return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
3367}
3368
3369/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3370/// expressed as a conjunction. See \ref AArch64CCMP.
3371/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3372/// changing the conditions on the SETCC tests.
3373/// (this means we can call emitConjunctionRec() with
3374/// Negate==true on this sub-tree)
3375/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3376/// cannot do the negation naturally. We are required to
3377/// emit the subtree first in this case.
3378 /// \param WillNegate Is true if we are called when the result of this
3379/// subexpression must be negated. This happens when the
3380/// outer expression is an OR. We can use this fact to know
3381/// that we have a double negation (or (or ...) ...) that
3382/// can be implemented for free.
3383static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3384 bool &MustBeFirst, bool WillNegate,
3385 unsigned Depth = 0) {
3386 if (!Val.hasOneUse())
3387 return false;
3388 unsigned Opcode = Val->getOpcode();
3389 if (Opcode == ISD::SETCC) {
3390 if (Val->getOperand(0).getValueType() == MVT::f128)
3391 return false;
3392 CanNegate = true;
3393 MustBeFirst = false;
3394 return true;
3395 }
3396 // Protect against exponential runtime and stack overflow.
3397 if (Depth > 6)
3398 return false;
3399 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3400 bool IsOR = Opcode == ISD::OR;
3401 SDValue O0 = Val->getOperand(0);
3402 SDValue O1 = Val->getOperand(1);
3403 bool CanNegateL;
3404 bool MustBeFirstL;
3405 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3406 return false;
3407 bool CanNegateR;
3408 bool MustBeFirstR;
3409 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3410 return false;
3411
3412 if (MustBeFirstL && MustBeFirstR)
3413 return false;
3414
3415 if (IsOR) {
3416 // For an OR expression we need to be able to naturally negate at least
3417 // one side or we cannot do the transformation at all.
3418 if (!CanNegateL && !CanNegateR)
3419 return false;
3420 // If the result of the OR will be negated and we can naturally negate
3421 // the leaves, then this sub-tree as a whole negates naturally.
3422 CanNegate = WillNegate && CanNegateL && CanNegateR;
3423 // If we cannot naturally negate the whole sub-tree, then this must be
3424 // emitted first.
3425 MustBeFirst = !CanNegate;
3426 } else {
3427 assert(Opcode == ISD::AND && "Must be OR or AND");
3428 // We cannot naturally negate an AND operation.
3429 CanNegate = false;
3430 MustBeFirst = MustBeFirstL || MustBeFirstR;
3431 }
3432 return true;
3433 }
3434 return false;
3435}
3436
3437/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
3438 /// of CCMP/FCCMP ops. See @ref AArch64CCMP.
3439 /// Tries to transform the given i1 producing node @p Val to a series of compare
3440 /// and conditional compare operations. @returns an NZCV flags producing node
3441 /// and sets @p OutCC to the flags that should be tested, or returns SDValue() if
3442 /// the transformation was not possible.
3443/// \p Negate is true if we want this sub-tree being negated just by changing
3444/// SETCC conditions.
3445 static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
3446 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3447 AArch64CC::CondCode Predicate) {
3448 // We're at a tree leaf, produce a conditional comparison operation.
3449 unsigned Opcode = Val->getOpcode();
3450 if (Opcode == ISD::SETCC) {
3451 SDValue LHS = Val->getOperand(0);
3452 SDValue RHS = Val->getOperand(1);
3453 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3454 bool isInteger = LHS.getValueType().isInteger();
3455 if (Negate)
3456 CC = getSetCCInverse(CC, LHS.getValueType());
3457 SDLoc DL(Val);
3458 // Determine OutCC and handle FP special case.
3459 if (isInteger) {
3460 OutCC = changeIntCCToAArch64CC(CC);
3461 } else {
3462 assert(LHS.getValueType().isFloatingPoint());
3463 AArch64CC::CondCode ExtraCC;
3464 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3465 // Some floating point conditions can't be tested with a single condition
3466 // code. Construct an additional comparison in this case.
3467 if (ExtraCC != AArch64CC::AL) {
3468 SDValue ExtraCmp;
3469 if (!CCOp.getNode())
3470 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3471 else
3472 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3473 ExtraCC, DL, DAG);
3474 CCOp = ExtraCmp;
3475 Predicate = ExtraCC;
3476 }
3477 }
3478
3479 // Produce a normal comparison if we are first in the chain
3480 if (!CCOp)
3481 return emitComparison(LHS, RHS, CC, DL, DAG);
3482 // Otherwise produce a ccmp.
3483 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3484 DAG);
3485 }
3486 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3487
3488 bool IsOR = Opcode == ISD::OR;
3489
3490 SDValue LHS = Val->getOperand(0);
3491 bool CanNegateL;
3492 bool MustBeFirstL;
3493 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3494 assert(ValidL && "Valid conjunction/disjunction tree");
3495 (void)ValidL;
3496
3497 SDValue RHS = Val->getOperand(1);
3498 bool CanNegateR;
3499 bool MustBeFirstR;
3500 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
3501 assert(ValidR && "Valid conjunction/disjunction tree");
3502 (void)ValidR;
3503
3504 // Swap sub-tree that must come first to the right side.
3505 if (MustBeFirstL) {
3506 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3507 std::swap(LHS, RHS);
3508 std::swap(CanNegateL, CanNegateR);
3509 std::swap(MustBeFirstL, MustBeFirstR);
3510 }
3511
3512 bool NegateR;
3513 bool NegateAfterR;
3514 bool NegateL;
3515 bool NegateAfterAll;
3516 if (Opcode == ISD::OR) {
3517 // Swap the sub-tree that we can negate naturally to the left.
3518 if (!CanNegateL) {
3519 assert(CanNegateR && "at least one side must be negatable");
3520 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3521 assert(!Negate);
3522 std::swap(LHS, RHS);
3523 NegateR = false;
3524 NegateAfterR = true;
3525 } else {
3526 // Negate the left sub-tree if possible, otherwise negate the result.
3527 NegateR = CanNegateR;
3528 NegateAfterR = !CanNegateR;
3529 }
3530 NegateL = true;
3531 NegateAfterAll = !Negate;
3532 } else {
3533 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3534 assert(!Negate && "Valid conjunction/disjunction tree");
3535
3536 NegateL = false;
3537 NegateR = false;
3538 NegateAfterR = false;
3539 NegateAfterAll = false;
3540 }
3541
3542 // Emit sub-trees.
3543 AArch64CC::CondCode RHSCC;
3544 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
3545 if (NegateAfterR)
3546 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
3547 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
3548 if (NegateAfterAll)
3549 OutCC = AArch64CC::getInvertedCondCode(OutCC);
3550 return CmpL;
3551}
3552
3553 /// Emit expression as a conjunction (a series of CCMP/FCCMP ops).
3554/// In some cases this is even possible with OR operations in the expression.
3555/// See \ref AArch64CCMP.
3556/// \see emitConjunctionRec().
3557 static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
3558 AArch64CC::CondCode &OutCC) {
3559 bool DummyCanNegate;
3560 bool DummyMustBeFirst;
3561 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
3562 return SDValue();
3563
3564 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
3565}
3566
3567/// @}
3568
3569/// Returns how profitable it is to fold a comparison's operand's shift and/or
3570/// extension operations.
3571 static unsigned getCmpOperandFoldingProfit(SDValue Op) {
3572 auto isSupportedExtend = [&](SDValue V) {
3573 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
3574 return true;
3575
3576 if (V.getOpcode() == ISD::AND)
3577 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
3578 uint64_t Mask = MaskCst->getZExtValue();
3579 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
3580 }
3581
3582 return false;
3583 };
3584
3585 if (!Op.hasOneUse())
3586 return 0;
3587
3588 if (isSupportedExtend(Op))
3589 return 1;
3590
3591 unsigned Opc = Op.getOpcode();
3592 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
3593 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3594 uint64_t Shift = ShiftCst->getZExtValue();
3595 if (isSupportedExtend(Op.getOperand(0)))
3596 return (Shift <= 4) ? 2 : 1;
3597 EVT VT = Op.getValueType();
3598 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
3599 return 1;
3600 }
3601
3602 return 0;
3603}
3604
3605 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3606 SDValue &AArch64cc, SelectionDAG &DAG,
3607 const SDLoc &dl) {
3608 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
3609 EVT VT = RHS.getValueType();
3610 uint64_t C = RHSC->getZExtValue();
3611 if (!isLegalArithImmed(C)) {
3612 // Constant does not fit, try adjusting it by one?
3613 switch (CC) {
3614 default:
3615 break;
3616 case ISD::SETLT:
3617 case ISD::SETGE:
3618 if ((VT == MVT::i32 && C != 0x80000000 &&
3619 isLegalArithImmed((uint32_t)(C - 1))) ||
3620 (VT == MVT::i64 && C != 0x80000000ULL &&
3621 isLegalArithImmed(C - 1ULL))) {
3622 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
3623 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3624 RHS = DAG.getConstant(C, dl, VT);
3625 }
3626 break;
3627 case ISD::SETULT:
3628 case ISD::SETUGE:
3629 if ((VT == MVT::i32 && C != 0 &&
3630 isLegalArithImmed((uint32_t)(C - 1))) ||
3631 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
3632 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
3633 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3634 RHS = DAG.getConstant(C, dl, VT);
3635 }
3636 break;
3637 case ISD::SETLE:
3638 case ISD::SETGT:
3639 if ((VT == MVT::i32 && C != INT32_MAX &&
3640 isLegalArithImmed((uint32_t)(C + 1))) ||
3641 (VT == MVT::i64 && C != INT64_MAX &&
3642 isLegalArithImmed(C + 1ULL))) {
3643 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGT;
3644 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3645 RHS = DAG.getConstant(C, dl, VT);
3646 }
3647 break;
3648 case ISD::SETULE:
3649 case ISD::SETUGT:
3650 if ((VT == MVT::i32 && C != UINT32_MAX &&
3651 isLegalArithImmed((uint32_t)(C + 1))) ||
3652 (VT == MVT::i64 && C != UINT64_MAX &&
3653 isLegalArithImmed(C + 1ULL))) {
3654 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGT;
3655 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3656 RHS = DAG.getConstant(C, dl, VT);
3657 }
3658 break;
3659 }
3660 }
3661 }
3662
3663 // Comparisons are canonicalized so that the RHS operand is simpler than the
3664 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
3665 // can fold some shift+extend operations on the RHS operand, so swap the
3666 // operands if that can be done.
3667 //
3668 // For example:
3669 // lsl w13, w11, #1
3670 // cmp w13, w12
3671 // can be turned into:
3672 // cmp w12, w11, lsl #1
3673 if (!isa<ConstantSDNode>(RHS) || !isLegalArithImmed(RHS->getAsZExtVal())) {
3674 SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
3675
3676 if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
3677 std::swap(LHS, RHS);
3678 CC = ISD::getSetCCSwappedOperands(CC);
3679 }
3680 }
3681
3682 SDValue Cmp;
3683 AArch64CC::CondCode AArch64CC;
3684 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
3685 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
3686
3687 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
3688 // For the i8 operand, the largest immediate is 255, so this can be easily
3689 // encoded in the compare instruction. For the i16 operand, however, the
3690 // largest immediate cannot be encoded in the compare.
3691 // Therefore, use a sign extending load and cmn to avoid materializing the
3692 // -1 constant. For example,
3693 // movz w1, #65535
3694 // ldrh w0, [x0, #0]
3695 // cmp w0, w1
3696 // >
3697 // ldrsh w0, [x0, #0]
3698 // cmn w0, #1
3699 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
3700 // if and only if (sext LHS) == (sext RHS). The checks are in place to
3701 // ensure both the LHS and RHS are truly zero extended and to make sure the
3702 // transformation is profitable.
3703 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
3704 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
3705 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
3706 LHS.getNode()->hasNUsesOfValue(1, 0)) {
3707 int16_t ValueofRHS = RHS->getAsZExtVal();
3708 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
3709 SDValue SExt =
3710 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
3711 DAG.getValueType(MVT::i16));
3712 Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
3713 RHS.getValueType()),
3714 CC, dl, DAG);
3715 AArch64CC = changeIntCCToAArch64CC(CC);
3716 }
3717 }
3718
3719 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
3720 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
3721 if ((CC == ISD::SETNE) ^ RHSC->isZero())
3722 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
3723 }
3724 }
3725 }
3726
3727 if (!Cmp) {
3728 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
3729 AArch64CC = changeIntCCToAArch64CC(CC);
3730 }
3731 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
3732 return Cmp;
3733}
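// A minimal standalone check of the constant-adjustment rewrites used above, e.g.
// (x < C) == (x <= C-1) and (x >= C) == (x > C-1), which hold as long as C-1 does not
// wrap. The constants chosen are illustrative (kept under #if 0).
#if 0
#include <cassert>
#include <cstdint>

int main() {
  const int64_t C = 4097;              // not encodable as an arithmetic immediate
  const int64_t CMinus1 = C - 1;       // 4096 is encodable (12-bit value << 12)
  for (int64_t X = C - 3; X <= C + 3; ++X) {
    assert((X < C) == (X <= CMinus1)); // SETLT -> SETLE with C-1
    assert((X >= C) == (X > CMinus1)); // SETGE -> SETGT with C-1
  }
  return 0;
}
#endif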
3734
3735static std::pair<SDValue, SDValue>
3736 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
3737 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
3738 "Unsupported value type");
3739 SDValue Value, Overflow;
3740 SDLoc DL(Op);
3741 SDValue LHS = Op.getOperand(0);
3742 SDValue RHS = Op.getOperand(1);
3743 unsigned Opc = 0;
3744 switch (Op.getOpcode()) {
3745 default:
3746 llvm_unreachable("Unknown overflow instruction!");
3747 case ISD::SADDO:
3748 Opc = AArch64ISD::ADDS;
3749 CC = AArch64CC::VS;
3750 break;
3751 case ISD::UADDO:
3752 Opc = AArch64ISD::ADDS;
3753 CC = AArch64CC::HS;
3754 break;
3755 case ISD::SSUBO:
3756 Opc = AArch64ISD::SUBS;
3757 CC = AArch64CC::VS;
3758 break;
3759 case ISD::USUBO:
3760 Opc = AArch64ISD::SUBS;
3761 CC = AArch64CC::LO;
3762 break;
3763 // Multiply needs a little bit of extra work.
3764 case ISD::SMULO:
3765 case ISD::UMULO: {
3766 CC = AArch64CC::NE;
3767 bool IsSigned = Op.getOpcode() == ISD::SMULO;
3768 if (Op.getValueType() == MVT::i32) {
3769 // Extend to 64-bits, then perform a 64-bit multiply.
3770 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3771 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
3772 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
3773 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3774 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
3775
3776 // Check that the result fits into a 32-bit integer.
3777 SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
3778 if (IsSigned) {
3779 // cmp xreg, wreg, sxtw
3780 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
3781 Overflow =
3782 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
3783 } else {
3784 // tst xreg, #0xffffffff00000000
3785 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
3786 Overflow =
3787 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
3788 }
3789 break;
3790 }
3791 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
3792 // For the 64 bit multiply
3793 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3794 if (IsSigned) {
3795 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
3796 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
3797 DAG.getConstant(63, DL, MVT::i64));
3798 // It is important that LowerBits is last, otherwise the arithmetic
3799 // shift will not be folded into the compare (SUBS).
3800 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3801 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
3802 .getValue(1);
3803 } else {
3804 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
3805 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3806 Overflow =
3807 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
3808 DAG.getConstant(0, DL, MVT::i64),
3809 UpperBits).getValue(1);
3810 }
3811 break;
3812 }
3813 } // switch (...)
3814
3815 if (Opc) {
3816 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
3817
3818 // Emit the AArch64 operation with overflow check.
3819 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
3820 Overflow = Value.getValue(1);
3821 }
3822 return std::make_pair(Value, Overflow);
3823}
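// A minimal standalone sketch of the 32-bit [su]mulo strategy above: do the multiply in
// 64 bits and flag overflow when the truncated result does not sign-/zero-extend back to
// the full product. Helper names are illustrative (kept under #if 0).
#if 0
#include <cassert>
#include <cstdint>

static bool smulo32(int32_t A, int32_t B, int32_t &Res) {
  int64_t Mul = (int64_t)A * (int64_t)B; // extend, then 64-bit multiply
  Res = (int32_t)Mul;                    // truncate to 32 bits
  return Mul != (int64_t)Res;            // overflow iff sign-extension differs
}

static bool umulo32(uint32_t A, uint32_t B, uint32_t &Res) {
  uint64_t Mul = (uint64_t)A * (uint64_t)B;
  Res = (uint32_t)Mul;
  return (Mul & 0xFFFFFFFF00000000ULL) != 0; // "tst xreg, #0xffffffff00000000"
}

int main() {
  int32_t S; uint32_t U;
  assert(!smulo32(3, 5, S) && S == 15);
  assert(smulo32(0x40000000, 4, S));  // 2^32 does not fit in i32
  assert(!umulo32(3, 5, U) && U == 15);
  assert(umulo32(0x80000000u, 2, U)); // 2^32 does not fit in u32
  return 0;
}
#endif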
3824
3825SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
3826 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
3827 !Subtarget->isNeonAvailable()))
3828 return LowerToScalableOp(Op, DAG);
3829
3830 SDValue Sel = Op.getOperand(0);
3831 SDValue Other = Op.getOperand(1);
3832 SDLoc dl(Sel);
3833
3834 // If the operand is an overflow checking operation, invert the condition
3835 // code and kill the Not operation. I.e., transform:
3836 // (xor (overflow_op_bool, 1))
3837 // -->
3838 // (csel 1, 0, invert(cc), overflow_op_bool)
3839 // ... which later gets transformed to just a cset instruction with an
3840 // inverted condition code, rather than a cset + eor sequence.
3841 if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) {
3842 // Only lower legal XALUO ops.
3843 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
3844 return SDValue();
3845
3846 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3847 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3848 AArch64CC::CondCode CC;
3849 SDValue Value, Overflow;
3850 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
3851 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3852 return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
3853 CCVal, Overflow);
3854 }
3855 // If neither operand is a SELECT_CC, give up.
3856 if (Sel.getOpcode() != ISD::SELECT_CC)
3857 std::swap(Sel, Other);
3858 if (Sel.getOpcode() != ISD::SELECT_CC)
3859 return Op;
3860
3861 // The folding we want to perform is:
3862 // (xor x, (select_cc a, b, cc, 0, -1) )
3863 // -->
3864 // (csel x, (xor x, -1), cc ...)
3865 //
3866 // The latter will get matched to a CSINV instruction.
3867
3868 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
3869 SDValue LHS = Sel.getOperand(0);
3870 SDValue RHS = Sel.getOperand(1);
3871 SDValue TVal = Sel.getOperand(2);
3872 SDValue FVal = Sel.getOperand(3);
3873
3874 // FIXME: This could be generalized to non-integer comparisons.
3875 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
3876 return Op;
3877
3878 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
3879 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
3880
3881 // The values aren't constants, this isn't the pattern we're looking for.
3882 if (!CFVal || !CTVal)
3883 return Op;
3884
3885 // We can commute the SELECT_CC by inverting the condition. This
3886 // might be needed to make this fit into a CSINV pattern.
3887 if (CTVal->isAllOnes() && CFVal->isZero()) {
3888 std::swap(TVal, FVal);
3889 std::swap(CTVal, CFVal);
3890 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
3891 }
3892
3893 // If the constants line up, perform the transform!
3894 if (CTVal->isZero() && CFVal->isAllOnes()) {
3895 SDValue CCVal;
3896 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
3897
3898 FVal = Other;
3899 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
3900 DAG.getConstant(-1ULL, dl, Other.getValueType()));
3901
3902 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
3903 CCVal, Cmp);
3904 }
3905
3906 return Op;
3907}
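// A minimal standalone check of the XOR/SELECT_CC identity exploited above, which is what
// lets the combined node match CSINV: x ^ (cond ? 0 : -1) == (cond ? x : ~x).
// Illustrative only (kept under #if 0).
#if 0
#include <cassert>
#include <cstdint>

int main() {
  for (uint64_t X : {UINT64_C(0), UINT64_C(1), UINT64_C(0x1234abcd)})
    for (bool Cond : {false, true})
      assert((X ^ (Cond ? UINT64_C(0) : ~UINT64_C(0))) == (Cond ? X : ~X));
  return 0;
}
#endif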
3908
3909// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
3910// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
3911// sets 'C' bit to 0.
3912 static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
3913 SDLoc DL(Value);
3914 EVT VT = Value.getValueType();
3915 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
3916 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
3917 SDValue Cmp =
3918 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
3919 return Cmp.getValue(1);
3920}
3921
3922// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
3923// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
3924 static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG,
3925 bool Invert) {
3926 assert(Glue.getResNo() == 1);
3927 SDLoc DL(Glue);
3928 SDValue Zero = DAG.getConstant(0, DL, VT);
3929 SDValue One = DAG.getConstant(1, DL, VT);
3930 unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
3931 SDValue CC = DAG.getConstant(Cond, DL, MVT::i32);
3932 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
3933}
3934
3935// Value is 1 if 'V' bit of NZCV is 1, else 0
3936 static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) {
3937 assert(Glue.getResNo() == 1);
3938 SDLoc DL(Glue);
3939 SDValue Zero = DAG.getConstant(0, DL, VT);
3940 SDValue One = DAG.getConstant(1, DL, VT);
3941 SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32);
3942 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
3943}
3944
3945// This lowering is inefficient, but it will get cleaned up by
3946// `foldOverflowCheck`
3947 static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
3948 unsigned Opcode, bool IsSigned) {
3949 EVT VT0 = Op.getValue(0).getValueType();
3950 EVT VT1 = Op.getValue(1).getValueType();
3951
3952 if (VT0 != MVT::i32 && VT0 != MVT::i64)
3953 return SDValue();
3954
3955 bool InvertCarry = Opcode == AArch64ISD::SBCS;
3956 SDValue OpLHS = Op.getOperand(0);
3957 SDValue OpRHS = Op.getOperand(1);
3958 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
3959
3960 SDLoc DL(Op);
3961 SDVTList VTs = DAG.getVTList(VT0, VT1);
3962
3963 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
3964 OpRHS, OpCarryIn);
3965
3966 SDValue OutFlag =
3967 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
3968 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
3969
3970 return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag);
3971}
3972
3974 // Let legalize expand this if it isn't a legal type yet.
3975 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
3976 return SDValue();
3977
3978 SDLoc dl(Op);
3979 AArch64CC::CondCode CC;
3980 // The actual operation that sets the overflow or carry flag.
3981 SDValue Value, Overflow;
3982 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
3983
3984 // We use 0 and 1 as false and true values.
3985 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3986 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3987
3988 // We use an inverted condition, because the conditional select is inverted
3989 // too. This will allow it to be selected to a single instruction:
3990 // CSINC Wd, WZR, WZR, invert(cond).
3991 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3992 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
3993 CCVal, Overflow);
3994
3995 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
3996 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
3997}
3998
3999// Prefetch operands are:
4000// 1: Address to prefetch
4001// 2: bool isWrite
4002// 3: int locality (0 = no locality ... 3 = extreme locality)
4003// 4: bool isDataCache
4004 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
4005 SDLoc DL(Op);
4006 unsigned IsWrite = Op.getConstantOperandVal(2);
4007 unsigned Locality = Op.getConstantOperandVal(3);
4008 unsigned IsData = Op.getConstantOperandVal(4);
4009
4010 bool IsStream = !Locality;
4011 // When the locality number is set:
4012 if (Locality) {
4013 // The front-end should have filtered out the out-of-range values.
4014 assert(Locality <= 3 && "Prefetch locality out-of-range");
4015 // The locality degree is the inverse of the cache level: locality 3
4016 // (keep as close as possible) corresponds to L1, whose encoding is 0,
4017 // so flip the number around.
4018 Locality = 3 - Locality;
4019 }
4020
4021 // Build the mask value encoding the expected behavior.
4022 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4023 (!IsData << 3) | // IsDataCache bit
4024 (Locality << 1) | // Cache level bits
4025 (unsigned)IsStream; // Stream bit
4026 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
4027 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
4028 Op.getOperand(1));
4029}
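// A minimal standalone sketch of the PrfOp bit packing above (load/store, data/instruction
// cache, cache level, stream). The helper name encodePrfOp and the example values are
// illustrative, not part of this file (kept under #if 0).
#if 0
#include <cstdio>

static unsigned encodePrfOp(bool IsWrite, unsigned Locality, bool IsData) {
  bool IsStream = Locality == 0;
  if (Locality)
    Locality = 3 - Locality;   // locality 3 (closest) -> level encoding 0 (L1)
  return (IsWrite << 4) |      // Load/Store bit
         (!IsData << 3) |      // IsDataCache bit
         (Locality << 1) |     // Cache level bits
         (unsigned)IsStream;   // Stream bit
}

int main() {
  // prefetch(ptr, /*rw=*/0, /*locality=*/3, /*data=*/1) -> PLDL1KEEP, encoding 0.
  std::printf("0x%x\n", encodePrfOp(false, 3, true));
  // prefetch(ptr, /*rw=*/1, /*locality=*/0, /*data=*/1) -> streaming store prefetch.
  std::printf("0x%x\n", encodePrfOp(true, 0, true));
  return 0;
}
#endif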
4030
4031SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4032 SelectionDAG &DAG) const {
4033 EVT VT = Op.getValueType();
4034 if (VT.isScalableVector())
4035 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4036
4037 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4038 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4039
4040 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4041 return SDValue();
4042}
4043
4044SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4045 SelectionDAG &DAG) const {
4046 EVT VT = Op.getValueType();
4047 if (VT.isScalableVector())
4048 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4049
4050 bool IsStrict = Op->isStrictFPOpcode();
4051 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4052 EVT SrcVT = SrcVal.getValueType();
4053 bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
4054
4055 if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
4056 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4057
4058 // Expand cases where the result type is BF16 but we don't have hardware
4059 // instructions to lower it.
4060 if (VT.getScalarType() == MVT::bf16 &&
4061 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4062 Subtarget->hasBF16())) {
4063 SDLoc dl(Op);
4064 SDValue Narrow = SrcVal;
4065 SDValue NaN;
4066 EVT I32 = SrcVT.changeElementType(MVT::i32);
4067 EVT F32 = SrcVT.changeElementType(MVT::f32);
4068 if (SrcVT.getScalarType() == MVT::f32) {
4069 bool NeverSNaN = DAG.isKnownNeverSNaN(Narrow);
4070 Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4071 if (!NeverSNaN) {
4072 // Set the quiet bit.
4073 NaN = DAG.getNode(ISD::OR, dl, I32, Narrow,
4074 DAG.getConstant(0x400000, dl, I32));
4075 }
4076 } else if (SrcVT.getScalarType() == MVT::f64) {
4077 Narrow = DAG.getNode(AArch64ISD::FCVTXN, dl, F32, Narrow);
4078 Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4079 } else {
4080 return SDValue();
4081 }
4082 if (!Trunc) {
4083 SDValue One = DAG.getConstant(1, dl, I32);
4084 SDValue Lsb = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4085 DAG.getShiftAmountConstant(16, I32, dl));
4086 Lsb = DAG.getNode(ISD::AND, dl, I32, Lsb, One);
4087 SDValue RoundingBias =
4088 DAG.getNode(ISD::ADD, dl, I32, DAG.getConstant(0x7fff, dl, I32), Lsb);
4089 Narrow = DAG.getNode(ISD::ADD, dl, I32, Narrow, RoundingBias);
4090 }
4091
4092 // Don't round if we had a NaN; we don't want to turn 0x7fffffff into
4093 // 0x80000000.
4094 if (NaN) {
4095 SDValue IsNaN = DAG.getSetCC(
4096 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT),
4097 SrcVal, SrcVal, ISD::SETUO);
4098 Narrow = DAG.getSelect(dl, I32, IsNaN, NaN, Narrow);
4099 }
4100
4101 // Now that we have rounded, shift the bits into position.
4102 Narrow = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4103 DAG.getShiftAmountConstant(16, I32, dl));
4104 if (VT.isVector()) {
4105 EVT I16 = I32.changeVectorElementType(MVT::i16);
4106 Narrow = DAG.getNode(ISD::TRUNCATE, dl, I16, Narrow);
4107 return DAG.getNode(ISD::BITCAST, dl, VT, Narrow);
4108 }
4109 Narrow = DAG.getNode(ISD::BITCAST, dl, F32, Narrow);
4110 SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, dl, VT, Narrow);
4111 return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, dl)
4112 : Result;
4113 }
4114
4115 if (SrcVT != MVT::f128) {
4116 // Expand cases where the input is a vector bigger than NEON.
4117 if (useSVEForFixedLengthVectorVT(SrcVT))
4118 return SDValue();
4119
4120 // It's legal except when f128 is involved
4121 return Op;
4122 }
4123
4124 return SDValue();
4125}
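// A minimal standalone sketch of the f32 -> bf16 round-to-nearest-even trick used above
// (add a 0x7fff bias plus the LSB of the result, then keep the top 16 bits); NaN quieting,
// which the lowering handles separately, is ignored here. The helper name f32ToBF16RNE is
// illustrative (kept under #if 0).
#if 0
#include <cassert>
#include <cstdint>
#include <cstring>

static uint16_t f32ToBF16RNE(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  uint32_t Lsb = (Bits >> 16) & 1;
  Bits += 0x7fff + Lsb;          // rounding bias, ties to even
  return (uint16_t)(Bits >> 16); // sign, exponent and top 7 mantissa bits
}

int main() {
  assert(f32ToBF16RNE(1.0f) == 0x3f80);
  assert(f32ToBF16RNE(-2.0f) == 0xc000);
  return 0;
}
#endif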
4126
4127SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4128 SelectionDAG &DAG) const {
4129 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4130 // Any additional optimization in this function should be recorded
4131 // in the cost tables.
4132 bool IsStrict = Op->isStrictFPOpcode();
4133 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
4134 EVT VT = Op.getValueType();
4135
4136 if (VT.isScalableVector()) {
4137 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4138 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4139 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4140 return LowerToPredicatedOp(Op, DAG, Opcode);
4141 }
4142
4143 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4144 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4145 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4146
4147 unsigned NumElts = InVT.getVectorNumElements();
4148
4149 // f16 conversions are promoted to f32 when full fp16 is not supported.
4150 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4151 InVT.getVectorElementType() == MVT::bf16) {
4152 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
4153 SDLoc dl(Op);
4154 if (IsStrict) {
4155 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
4156 {Op.getOperand(0), Op.getOperand(1)});
4157 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4158 {Ext.getValue(1), Ext.getValue(0)});
4159 }
4160 return DAG.getNode(
4161 Op.getOpcode(), dl, Op.getValueType(),
4162 DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
4163 }
4164
4165 uint64_t VTSize = VT.getFixedSizeInBits();
4166 uint64_t InVTSize = InVT.getFixedSizeInBits();
4167 if (VTSize < InVTSize) {
4168 SDLoc dl(Op);
4169 if (IsStrict) {
4170 InVT = InVT.changeVectorElementTypeToInteger();
4171 SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other},
4172 {Op.getOperand(0), Op.getOperand(1)});
4173 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4174 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl);
4175 }
4176 SDValue Cv =
4177 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
4178 Op.getOperand(0));
4179 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4180 }
4181
4182 if (VTSize > InVTSize) {
4183 SDLoc dl(Op);
4184 MVT ExtVT =
4185 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
4186 VT.getVectorNumElements());
4187 if (IsStrict) {
4188 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other},
4189 {Op.getOperand(0), Op.getOperand(1)});
4190 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4191 {Ext.getValue(1), Ext.getValue(0)});
4192 }
4193 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
4194 return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
4195 }
4196
4197 // Use a scalar operation for conversions between single-element vectors of
4198 // the same size.
4199 if (NumElts == 1) {
4200 SDLoc dl(Op);
4201 SDValue Extract = DAG.getNode(
4202 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4203 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64));
4204 EVT ScalarVT = VT.getScalarType();
4205 if (IsStrict)
4206 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4207 {Op.getOperand(0), Extract});
4208 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4209 }
4210
4211 // Type changing conversions are illegal.
4212 return Op;
4213}
4214
4215SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4216 SelectionDAG &DAG) const {
4217 bool IsStrict = Op->isStrictFPOpcode();
4218 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4219
4220 if (SrcVal.getValueType().isVector())
4221 return LowerVectorFP_TO_INT(Op, DAG);
4222
4223 // f16 conversions are promoted to f32 when full fp16 is not supported.
4224 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4225 SrcVal.getValueType() == MVT::bf16) {
4226 SDLoc dl(Op);
4227 if (IsStrict) {
4228 SDValue Ext =
4229 DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
4230 {Op.getOperand(0), SrcVal});
4231 return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other},
4232 {Ext.getValue(1), Ext.getValue(0)});
4233 }
4234 return DAG.getNode(
4235 Op.getOpcode(), dl, Op.getValueType(),
4236 DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
4237 }
4238
4239 if (SrcVal.getValueType() != MVT::f128) {
4240 // It's legal except when f128 is involved
4241 return Op;
4242 }
4243
4244 return SDValue();
4245}
4246
4247SDValue
4248AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4249 SelectionDAG &DAG) const {
4250 // AArch64 FP-to-int conversions saturate to the destination element size, so
4251 // we can lower common saturating conversions to simple instructions.
4252 SDValue SrcVal = Op.getOperand(0);
4253 EVT SrcVT = SrcVal.getValueType();
4254 EVT DstVT = Op.getValueType();
4255 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4256
4257 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4258 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4259 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4260 assert(SatWidth <= DstElementWidth &&
4261 "Saturation width cannot exceed result width");
4262
4263 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4264 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4265 // types, so this is hard to reach.
4266 if (DstVT.isScalableVector())
4267 return SDValue();
4268
4269 EVT SrcElementVT = SrcVT.getVectorElementType();
4270
4271 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4272 if ((SrcElementVT == MVT::f16 &&
4273 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
4274 SrcElementVT == MVT::bf16) {
4275 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
4276 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), F32VT, SrcVal);
4277 SrcVT = F32VT;
4278 SrcElementVT = MVT::f32;
4279 SrcElementWidth = 32;
4280 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4281 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
4282 return SDValue();
4283
4284 SDLoc DL(Op);
4285 // Cases that we can emit directly.
4286 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth)
4287 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4288 DAG.getValueType(DstVT.getScalarType()));
4289
4290 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4291 // result. This is only valid if the legal cvt is larger than the saturate
4292 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
4293 // (at least until sqxtn is selected).
4294 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4295 return SDValue();
4296
4297 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4298 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
4299 DAG.getValueType(IntVT.getScalarType()));
4300 SDValue Sat;
4301 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4302 SDValue MinC = DAG.getConstant(
4303 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4304 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
4305 SDValue MaxC = DAG.getConstant(
4306 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4307 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
4308 } else {
4309 SDValue MinC = DAG.getConstant(
4310 APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
4311 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
4312 }
4313
4314 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4315}
4316
4317SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
4318 SelectionDAG &DAG) const {
4319 // AArch64 FP-to-int conversions saturate to the destination register size, so
4320 // we can lower common saturating conversions to simple instructions.
4321 SDValue SrcVal = Op.getOperand(0);
4322 EVT SrcVT = SrcVal.getValueType();
4323
4324 if (SrcVT.isVector())
4325 return LowerVectorFP_TO_INT_SAT(Op, DAG);
4326
4327 EVT DstVT = Op.getValueType();
4328 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4329 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4330 uint64_t DstWidth = DstVT.getScalarSizeInBits();
4331 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
4332
4333 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4334 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
4335 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
4336 SrcVT = MVT::f32;
4337 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
4338 SrcVT != MVT::bf16)
4339 return SDValue();
4340
4341 SDLoc DL(Op);
4342 // Cases that we can emit directly.
4343 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
4344 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
4345 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
4346 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4347 DAG.getValueType(DstVT));
4348
4349 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4350 // result. This is only valid if the legal cvt is larger than the saturate
4351 // width.
4352 if (DstWidth < SatWidth)
4353 return SDValue();
4354
4355 SDValue NativeCvt =
4356 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
4357 SDValue Sat;
4358 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4359 SDValue MinC = DAG.getConstant(
4360 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
4361 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
4362 SDValue MaxC = DAG.getConstant(
4363 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
4364 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
4365 } else {
4366 SDValue MinC = DAG.getConstant(
4367 APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
4368 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
4369 }
4370
4371 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4372}
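// A minimal standalone model of the saturating strategy above for the case where the
// saturation width is narrower than the converted width: convert at the wider width, then
// clamp with signed min/max before truncating. The helper name fpToSI8Sat is illustrative,
// and it assumes the input is already within i32 range (kept under #if 0).
#if 0
#include <algorithm>
#include <cassert>
#include <cstdint>

static int8_t fpToSI8Sat(float F) {
  int32_t Wide = (int32_t)F;                // wider conversion (assumes F fits in i32)
  Wide = std::min(Wide, (int32_t)INT8_MAX); // SMIN with the signed max of the sat width
  Wide = std::max(Wide, (int32_t)INT8_MIN); // SMAX with the signed min of the sat width
  return (int8_t)Wide;                      // TRUNCATE
}

int main() {
  assert(fpToSI8Sat(42.9f) == 42);
  assert(fpToSI8Sat(1000.0f) == 127);
  assert(fpToSI8Sat(-1000.0f) == -128);
  return 0;
}
#endif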
4373
4374SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
4375 SelectionDAG &DAG) const {
4376 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4377 // Any additional optimization in this function should be recorded
4378 // in the cost tables.
4379 bool IsStrict = Op->isStrictFPOpcode();
4380 EVT VT = Op.getValueType();
4381 SDLoc dl(Op);
4382 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
4383 EVT InVT = In.getValueType();
4384 unsigned Opc = Op.getOpcode();
4385 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
4386
4387 if (VT.isScalableVector()) {
4388 if (InVT.getVectorElementType() == MVT::i1) {
4389 // We can't directly extend an SVE predicate; extend it first.
4390 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4391 EVT CastVT = getPromotedVTForPredicate(InVT);
4392 In = DAG.getNode(CastOpc, dl, CastVT, In);
4393 return DAG.getNode(Opc, dl, VT, In);
4394 }
4395
4396 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
4397 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
4398 return LowerToPredicatedOp(Op, DAG, Opcode);
4399 }
4400
4401 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4402 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4403 return LowerFixedLengthIntToFPToSVE(Op, DAG);
4404
4405 // Promote bf16 conversions to f32.
4406 if (VT.getVectorElementType() == MVT::bf16) {
4407 EVT F32 = VT.changeElementType(MVT::f32);
4408 if (IsStrict) {
4409 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {F32, MVT::Other},
4410 {Op.getOperand(0), In});
4411 return DAG.getNode(
4412 ISD::STRICT_FP_ROUND, dl, {Op.getValueType(), MVT::Other},
4413 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4414 }
4415 return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
4416 DAG.getNode(Op.getOpcode(), dl, F32, In),
4417 DAG.getIntPtrConstant(0, dl));
4418 }
4419
4420 uint64_t VTSize = VT.getFixedSizeInBits();
4421 uint64_t InVTSize = InVT.getFixedSizeInBits();
4422 if (VTSize < InVTSize) {
4423 MVT CastVT =
4424 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
4425 InVT.getVectorNumElements());
4426 if (IsStrict) {
4427 In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
4428 {Op.getOperand(0), In});
4429 return DAG.getNode(
4430 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
4431 {In.getValue(1), In.getValue(0), DAG.getIntPtrConstant(0, dl)});
4432 }
4433 In = DAG.getNode(Opc, dl, CastVT, In);
4434 return DAG.getNode(ISD::FP_ROUND, dl, VT, In,
4435 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
4436 }
4437
4438 if (VTSize > InVTSize) {
4439 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4440 EVT CastVT = VT.changeVectorElementTypeToInteger();
4441 In = DAG.getNode(CastOpc, dl, CastVT, In);
4442 if (IsStrict)
4443 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In});
4444 return DAG.getNode(Opc, dl, VT, In);
4445 }
4446
4447 // Use a scalar operation for conversions between single-element vectors of
4448 // the same size.
4449 if (VT.getVectorNumElements() == 1) {
4450     SDValue Extract = DAG.getNode(
4451         ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4452         In, DAG.getConstant(0, dl, MVT::i64));
4453 EVT ScalarVT = VT.getScalarType();
4454 if (IsStrict)
4455 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4456 {Op.getOperand(0), Extract});
4457 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4458 }
4459
4460 return Op;
4461}
4462
4463SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
4464 SelectionDAG &DAG) const {
4465 if (Op.getValueType().isVector())
4466 return LowerVectorINT_TO_FP(Op, DAG);
4467
4468 bool IsStrict = Op->isStrictFPOpcode();
4469 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4470
4471 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
4472 Op->getOpcode() == ISD::SINT_TO_FP;
4473
4474 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
4475 SDLoc dl(Op);
4476 if (IsStrict) {
4477 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {PromoteVT, MVT::Other},
4478 {Op.getOperand(0), SrcVal});
4479 return DAG.getNode(
4480 ISD::STRICT_FP_ROUND, dl, {Op.getValueType(), MVT::Other},
4481 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4482 }
4483 return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
4484 DAG.getNode(Op.getOpcode(), dl, PromoteVT, SrcVal),
4485 DAG.getIntPtrConstant(0, dl));
4486 };
4487
4488 if (Op.getValueType() == MVT::bf16) {
4489 unsigned MaxWidth = IsSigned
4490 ? DAG.ComputeMaxSignificantBits(SrcVal)
4491 : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
4492 // bf16 conversions are promoted to f32 when converting from i16.
4493 if (MaxWidth <= 24) {
4494 return IntToFpViaPromotion(MVT::f32);
4495 }
4496
4497 // bf16 conversions are promoted to f64 when converting from i32.
4498 if (MaxWidth <= 53) {
4499 return IntToFpViaPromotion(MVT::f64);
4500 }
4501
4502 // We need to be careful about i64 -> bf16.
4503       // Consider the value 22216703.
4504       // It cannot be represented exactly as an f32, so an itofp will round it
4505       // to 22216704.0, and an fptrunc to bf16 then rounds that to 22282240.0.
4506       // However, the correctly rounded bf16 result is 22151168.0.
4507 // We need to use sticky rounding to get this correct.
4508 if (SrcVal.getValueType() == MVT::i64) {
4509 SDLoc DL(Op);
4510 // This algorithm is equivalent to the following:
4511 // uint64_t SrcHi = SrcVal & ~0xfffull;
4512 // uint64_t SrcLo = SrcVal & 0xfffull;
4513 // uint64_t Highest = SrcVal >> 53;
4514 // bool HasHighest = Highest != 0;
4515 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
4516 // double Rounded = static_cast<double>(ToRound);
4517 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
4518 // uint64_t HasLo = SrcLo != 0;
4519 // bool NeedsAdjustment = HasHighest & HasLo;
4520 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
4521 // double Adjusted = std::bit_cast<double>(AdjustedBits);
4522 // return static_cast<__bf16>(Adjusted);
4523 //
4524 // Essentially, what happens is that SrcVal either fits perfectly in a
4525 // double-precision value or it is too big. If it is sufficiently small,
4526 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
4527 // ensure that u64 -> double has no rounding error by only using the 52
4528 // MSB of the input. The low order bits will get merged into a sticky bit
4529 // which will avoid issues incurred by double rounding.
4530
4531 // Signed conversion is more or less like so:
4532 // copysign((__bf16)abs(SrcVal), SrcVal)
4533 SDValue SignBit;
4534 if (IsSigned) {
4535 SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4536 DAG.getConstant(1ull << 63, DL, MVT::i64));
4537 SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
4538 }
4539 SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4540 DAG.getConstant(~0xfffull, DL, MVT::i64));
4541 SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4542                                   DAG.getConstant(0xfffull, DL, MVT::i64));
4543       SDValue Highest =
4544           DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
4545 DAG.getShiftAmountConstant(53, MVT::i64, DL));
4546 SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
4547 SDValue ToRound =
4548 DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
4549 SDValue Rounded =
4550 IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
4551 {Op.getOperand(0), ToRound})
4552 : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
4553
4554 SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
4555 if (SignBit) {
4556 RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
4557 }
4558
4559 SDValue HasHighest = DAG.getSetCC(
4560 DL,
4561 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
4562 Highest, Zero64, ISD::SETNE);
4563
4564 SDValue HasLo = DAG.getSetCC(
4565 DL,
4566 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
4567 SrcLo, Zero64, ISD::SETNE);
4568
4569 SDValue NeedsAdjustment =
4570 DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
4571 NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
4572
4573 SDValue AdjustedBits =
4574 DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
4575 SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
4576       return IsStrict
4577                  ? DAG.getNode(ISD::STRICT_FP_ROUND, DL,
4578                                {Op.getValueType(), MVT::Other},
4579 {Rounded.getValue(1), Adjusted,
4580 DAG.getIntPtrConstant(0, DL)})
4581 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
4582 DAG.getIntPtrConstant(0, DL, true));
4583 }
4584 }
4585
4586 // f16 conversions are promoted to f32 when full fp16 is not supported.
4587 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
4588 return IntToFpViaPromotion(MVT::f32);
4589 }
4590
4591 // i128 conversions are libcalls.
4592 if (SrcVal.getValueType() == MVT::i128)
4593 return SDValue();
4594
4595 // Other conversions are legal, unless it's to the completely software-based
4596 // fp128.
4597 if (Op.getValueType() != MVT::f128)
4598 return Op;
4599 return SDValue();
4600}
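// A minimal standalone sketch of the sticky-rounding idea used in the
// i64 -> bf16 path above, assuming C++20 <bit> and <cstdint>. It only mirrors
// the commented algorithm for the unsigned case; the helper name is
// illustrative and is not part of this lowering.
static double stickyRoundToDouble(uint64_t SrcVal) {
  uint64_t SrcHi = SrcVal & ~0xfffull;       // top bits fit a double exactly
  uint64_t SrcLo = SrcVal & 0xfffull;        // bits that would otherwise be lost
  bool HasHighest = (SrcVal >> 53) != 0;     // too wide to convert exactly?
  double Rounded = static_cast<double>(HasHighest ? SrcHi : SrcVal);
  uint64_t Bits = std::bit_cast<uint64_t>(Rounded);
  // Fold "lost low bits were non-zero" into the LSB as a sticky bit so the
  // later rounding to bf16 cannot double-round.
  Bits |= uint64_t(HasHighest && SrcLo != 0);
  return std::bit_cast<double>(Bits);
}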
4601
4602SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
4603 SelectionDAG &DAG) const {
4604 // For iOS, we want to call an alternative entry point: __sincos_stret,
4605 // which returns the values in two S / D registers.
4606 SDLoc dl(Op);
4607 SDValue Arg = Op.getOperand(0);
4608 EVT ArgVT = Arg.getValueType();
4609 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
4610
4612 ArgListEntry Entry;
4613
4614 Entry.Node = Arg;
4615 Entry.Ty = ArgTy;
4616 Entry.IsSExt = false;
4617 Entry.IsZExt = false;
4618 Args.push_back(Entry);
4619
4620 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
4621 : RTLIB::SINCOS_STRET_F32;
4622 const char *LibcallName = getLibcallName(LC);
4623 SDValue Callee =
4624 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
4625
4626   StructType *RetTy = StructType::get(ArgTy, ArgTy);
4627   TargetLowering::CallLoweringInfo CLI(DAG);
4628   CLI.setDebugLoc(dl)
4629 .setChain(DAG.getEntryNode())
4630 .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
4631
4632 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
4633 return CallResult.first;
4634}
4635
4636static MVT getSVEContainerType(EVT ContentTy);
4637
4638SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
4639 SelectionDAG &DAG) const {
4640 EVT OpVT = Op.getValueType();
4641 EVT ArgVT = Op.getOperand(0).getValueType();
4642
4643   if (useSVEForFixedLengthVectorVT(OpVT, !Subtarget->isNeonAvailable()))
4644     return LowerFixedLengthBitcastToSVE(Op, DAG);
4645
4646 if (OpVT.isScalableVector()) {
4647 // Bitcasting between unpacked vector types of different element counts is
4648 // not a NOP because the live elements are laid out differently.
4649 // 01234567
4650 // e.g. nxv2i32 = XX??XX??
4651 // nxv4f16 = X?X?X?X?
4652 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
4653 return SDValue();
4654
4655 if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) {
4656 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
4657 "Expected int->fp bitcast!");
4658       SDValue ExtResult =
4659           DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
4660                       Op.getOperand(0));
4661 return getSVESafeBitCast(OpVT, ExtResult, DAG);
4662 }
4663 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
4664 }
4665
4666 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
4667 return SDValue();
4668
4669 // Bitcasts between f16 and bf16 are legal.
4670 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
4671 return Op;
4672
4673 assert(ArgVT == MVT::i16);
4674 SDLoc DL(Op);
4675
4676 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
4677 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
4678 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
4679}
4680
4681static EVT getExtensionTo64Bits(const EVT &OrigVT) {
4682 if (OrigVT.getSizeInBits() >= 64)
4683 return OrigVT;
4684
4685 assert(OrigVT.isSimple() && "Expecting a simple value type");
4686
4687 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
4688 switch (OrigSimpleTy) {
4689 default: llvm_unreachable("Unexpected Vector Type");
4690 case MVT::v2i8:
4691 case MVT::v2i16:
4692 return MVT::v2i32;
4693 case MVT::v4i8:
4694 return MVT::v4i16;
4695 }
4696}
4697
4698 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
4699                                                  const EVT &OrigTy,
4700                                                  const EVT &ExtTy,
4701                                                  unsigned ExtOpcode) {
4702 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
4703 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
4704 // 64-bits we need to insert a new extension so that it will be 64-bits.
4705 assert(ExtTy.is128BitVector() && "Unexpected extension size");
4706 if (OrigTy.getSizeInBits() >= 64)
4707 return N;
4708
4709 // Must extend size to at least 64 bits to be used as an operand for VMULL.
4710 EVT NewVT = getExtensionTo64Bits(OrigTy);
4711
4712 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
4713}
4714
4715// Returns lane if Op extracts from a two-element vector and lane is constant
4716// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
4717 static std::optional<uint64_t>
4718 getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
4719   SDNode *OpNode = Op.getNode();
4720 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
4721 return std::nullopt;
4722
4723 EVT VT = OpNode->getOperand(0).getValueType();
4724 ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
4725 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
4726 return std::nullopt;
4727
4728 return C->getZExtValue();
4729}
4730
4731 static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
4732                                    bool isSigned) {
4733 EVT VT = N.getValueType();
4734
4735 if (N.getOpcode() != ISD::BUILD_VECTOR)
4736 return false;
4737
4738 for (const SDValue &Elt : N->op_values()) {
4739 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
4740 unsigned EltSize = VT.getScalarSizeInBits();
4741 unsigned HalfSize = EltSize / 2;
4742 if (isSigned) {
4743 if (!isIntN(HalfSize, C->getSExtValue()))
4744 return false;
4745 } else {
4746 if (!isUIntN(HalfSize, C->getZExtValue()))
4747 return false;
4748 }
4749 continue;
4750 }
4751 return false;
4752 }
4753
4754 return true;
4755}
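// A small standalone model of the half-width test above, assuming
// llvm/Support/MathExtras.h for isIntN/isUIntN; the helper name is
// illustrative only. For example, a v8i16 build_vector whose signed elements
// all fit in 8 bits behaves like a sign-extension from v8i8 and is therefore
// a valid VMULL operand.
static bool constantFitsInHalfWidth(int64_t SExtVal, uint64_t ZExtVal,
                                    unsigned EltSizeInBits, bool IsSigned) {
  unsigned HalfSize = EltSizeInBits / 2;
  return IsSigned ? isIntN(HalfSize, SExtVal) : isUIntN(HalfSize, ZExtVal);
}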
4756
4757 static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
4758   EVT VT = N.getValueType();
4759 assert(VT.is128BitVector() && "Unexpected vector MULL size");
4760
4761 unsigned NumElts = VT.getVectorNumElements();
4762 unsigned OrigEltSize = VT.getScalarSizeInBits();
4763 unsigned EltSize = OrigEltSize / 2;
4764 MVT TruncVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
4765
4766 APInt HiBits = APInt::getHighBitsSet(OrigEltSize, EltSize);
4767 if (DAG.MaskedValueIsZero(N, HiBits))
4768 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), TruncVT, N);
4769
4770 if (ISD::isExtOpcode(N.getOpcode()))
4771 return addRequiredExtensionForVectorMULL(N.getOperand(0), DAG,
4772 N.getOperand(0).getValueType(), VT,
4773 N.getOpcode());
4774
4775 assert(N.getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
4776   SDLoc dl(N);
4777   SmallVector<SDValue, 8> Ops;
4778   for (unsigned i = 0; i != NumElts; ++i) {
4779 const APInt &CInt = N.getConstantOperandAPInt(i);
4780 // Element types smaller than 32 bits are not legal, so use i32 elements.
4781 // The values are implicitly truncated so sext vs. zext doesn't matter.
4782 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
4783 }
4784 return DAG.getBuildVector(TruncVT, dl, Ops);
4785}
4786
4787 static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
4788   return N.getOpcode() == ISD::SIGN_EXTEND ||
4789 N.getOpcode() == ISD::ANY_EXTEND ||
4790 isExtendedBUILD_VECTOR(N, DAG, true);
4791}
4792
4793 static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {
4794   return N.getOpcode() == ISD::ZERO_EXTEND ||
4795 N.getOpcode() == ISD::ANY_EXTEND ||
4796 isExtendedBUILD_VECTOR(N, DAG, false);
4797}
4798
4799 static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
4800   unsigned Opcode = N.getOpcode();
4801 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4802 SDValue N0 = N.getOperand(0);
4803 SDValue N1 = N.getOperand(1);
4804 return N0->hasOneUse() && N1->hasOneUse() &&
4805 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
4806 }
4807 return false;
4808}
4809
4810 static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
4811   unsigned Opcode = N.getOpcode();
4812 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4813 SDValue N0 = N.getOperand(0);
4814 SDValue N1 = N.getOperand(1);
4815 return N0->hasOneUse() && N1->hasOneUse() &&
4816 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
4817 }
4818 return false;
4819}
4820
4821SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
4822 SelectionDAG &DAG) const {
4823   // The rounding mode is in bits 23:22 of the FPCR.
4824   // The AArch64 rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
4825   // The formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3,
4826 // so that the shift + and get folded into a bitfield extract.
4827 SDLoc dl(Op);
4828
4829 SDValue Chain = Op.getOperand(0);
4830 SDValue FPCR_64 = DAG.getNode(
4831 ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
4832 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
4833 Chain = FPCR_64.getValue(1);
4834 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
4835 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
4836 DAG.getConstant(1U << 22, dl, MVT::i32));
4837 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
4838 DAG.getConstant(22, dl, MVT::i32));
4839 SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
4840 DAG.getConstant(3, dl, MVT::i32));
4841 return DAG.getMergeValues({AND, Chain}, dl);
4842}
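// An illustrative scalar model of the bit trick above: adding 1 << 22
// increments the FPCR[23:22] field modulo 4, so the extracted value is
// (field + 1) & 3, which is exactly the 0->1, 1->2, 2->3, 3->0 mapping
// required by FLT_ROUNDS. The helper name is hypothetical and not part of
// this lowering.
static unsigned fpcrRoundingToFltRounds(unsigned FPCR) {
  return ((FPCR + (1u << 22)) >> 22) & 3;
}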
4843
4844SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
4845 SelectionDAG &DAG) const {
4846 SDLoc DL(Op);
4847 SDValue Chain = Op->getOperand(0);
4848 SDValue RMValue = Op->getOperand(1);
4849
4850 // The rounding mode is in bits 23:22 of the FPCR.
4851 // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
4852 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
4853 // ((arg - 1) & 3) << 22).
4854 //
4855 // The argument of llvm.set.rounding must be within the segment [0, 3], so
4856   // NearestTiesToAway (4) is not handled here. It is the responsibility of the
4857   // code that generates llvm.set.rounding to ensure this condition.
4858
4859 // Calculate new value of FPCR[23:22].
4860 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
4861 DAG.getConstant(1, DL, MVT::i32));
4862 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
4863 DAG.getConstant(0x3, DL, MVT::i32));
4864 RMValue =
4865 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
4866 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
4867 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
4868
4869 // Get current value of FPCR.
4870 SDValue Ops[] = {
4871 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
4872 SDValue FPCR =
4873 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
4874 Chain = FPCR.getValue(1);
4875 FPCR = FPCR.getValue(0);
4876
4877   // Put the new rounding mode into FPCR[23:22].
4878 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
4879 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
4880 DAG.getConstant(RMMask, DL, MVT::i64));
4881 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
4882 SDValue Ops2[] = {
4883 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
4884 FPCR};
4885 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
4886}
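// The matching inverse used above, again as a hypothetical scalar helper:
// (Arg - 1) & 3 maps the llvm.set.rounding argument back to the FPCR field,
// i.e. 0->3, 1->0, 2->1, 3->2.
static unsigned fltRoundsToFpcrField(unsigned RMValue) {
  return (RMValue - 1) & 3;
}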
4887
4888static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
4889 SDLoc DL, bool &IsMLA) {
4890 bool IsN0SExt = isSignExtended(N0, DAG);
4891 bool IsN1SExt = isSignExtended(N1, DAG);
4892 if (IsN0SExt && IsN1SExt)
4893 return AArch64ISD::SMULL;
4894
4895 bool IsN0ZExt = isZeroExtended(N0, DAG);
4896 bool IsN1ZExt = isZeroExtended(N1, DAG);
4897
4898 if (IsN0ZExt && IsN1ZExt)
4899 return AArch64ISD::UMULL;
4900
4901 // Select SMULL if we can replace zext with sext.
4902 if (((IsN0SExt && IsN1ZExt) || (IsN0ZExt && IsN1SExt)) &&
4903 !isExtendedBUILD_VECTOR(N0, DAG, false) &&
4904 !isExtendedBUILD_VECTOR(N1, DAG, false)) {
4905 SDValue ZextOperand;
4906 if (IsN0ZExt)
4907 ZextOperand = N0.getOperand(0);
4908 else
4909 ZextOperand = N1.getOperand(0);
4910 if (DAG.SignBitIsZero(ZextOperand)) {
4911 SDValue NewSext =
4912 DAG.getSExtOrTrunc(ZextOperand, DL, N0.getValueType());
4913 if (IsN0ZExt)
4914 N0 = NewSext;
4915 else
4916 N1 = NewSext;
4917 return AArch64ISD::SMULL;
4918 }
4919 }
4920
4921 // Select UMULL if we can replace the other operand with an extend.
4922 if (IsN0ZExt || IsN1ZExt) {
4923     EVT VT = N0.getValueType();
4924     APInt Mask = APInt::getHighBitsSet(VT.getScalarSizeInBits(),
4925                                        VT.getScalarSizeInBits() / 2);
4926 if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
4927 return AArch64ISD::UMULL;
4928 }
4929
4930 if (!IsN1SExt && !IsN1ZExt)
4931 return 0;
4932
4933 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
4934 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
4935 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
4936 IsMLA = true;
4937 return AArch64ISD::SMULL;
4938 }
4939 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
4940 IsMLA = true;
4941 return AArch64ISD::UMULL;
4942 }
4943 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
4944 std::swap(N0, N1);
4945 IsMLA = true;
4946 return AArch64ISD::UMULL;
4947 }
4948 return 0;
4949}
4950
4951SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
4952 EVT VT = Op.getValueType();
4953
4954 bool OverrideNEON = !Subtarget->isNeonAvailable();
4955 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
4956 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
4957
4958 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
4959 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
4960 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
4961 "unexpected type for custom-lowering ISD::MUL");
4962 SDValue N0 = Op.getOperand(0);
4963 SDValue N1 = Op.getOperand(1);
4964 bool isMLA = false;
4965 EVT OVT = VT;
4966 if (VT.is64BitVector()) {
4967 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4968         isNullConstant(N0.getOperand(1)) &&
4969         N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4970         isNullConstant(N1.getOperand(1))) {
4971 N0 = N0.getOperand(0);
4972 N1 = N1.getOperand(0);
4973 VT = N0.getValueType();
4974 } else {
4975 if (VT == MVT::v1i64) {
4976 if (Subtarget->hasSVE())
4977 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
4978 // Fall through to expand this. It is not legal.
4979 return SDValue();
4980 } else
4981 // Other vector multiplications are legal.
4982 return Op;
4983 }
4984 }
4985
4986 SDLoc DL(Op);
4987 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
4988
4989 if (!NewOpc) {
4990 if (VT.getVectorElementType() == MVT::i64) {
4991 // If SVE is available then i64 vector multiplications can also be made
4992 // legal.
4993 if (Subtarget->hasSVE())
4994 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
4995 // Fall through to expand this. It is not legal.
4996 return SDValue();
4997 } else
4998 // Other vector multiplications are legal.
4999 return Op;
5000 }
5001
5002 // Legalize to a S/UMULL instruction
5003 SDValue Op0;
5004 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
5005 if (!isMLA) {
5006     Op0 = skipExtensionForVectorMULL(N0, DAG);
5007     assert(Op0.getValueType().is64BitVector() &&
5008            Op1.getValueType().is64BitVector() &&
5009 "unexpected types for extended operands to VMULL");
5010 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
5011 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
5012 DAG.getConstant(0, DL, MVT::i64));
5013 }
5014 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
5015 // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
5016   // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57.
5017   SDValue N00 = skipExtensionForVectorMULL(N0.getOperand(0), DAG);
5018   SDValue N01 = skipExtensionForVectorMULL(N0.getOperand(1), DAG);
5019   EVT Op1VT = Op1.getValueType();
5020   return DAG.getNode(
5021       ISD::EXTRACT_SUBVECTOR, DL, OVT,
5022       DAG.getNode(N0.getOpcode(), DL, VT,
5023 DAG.getNode(NewOpc, DL, VT,
5024 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5025 DAG.getNode(NewOpc, DL, VT,
5026 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5027 DAG.getConstant(0, DL, MVT::i64));
5028}
5029
5030static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5031 int Pattern) {
5032 if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
5033 return DAG.getConstant(1, DL, MVT::nxv1i1);
5034 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
5035 DAG.getTargetConstant(Pattern, DL, MVT::i32));
5036}
5037
5038 static SDValue optimizeIncrementingWhile(SDValue Op, SelectionDAG &DAG,
5039                                          bool IsSigned, bool IsEqual) {
5040 if (!isa<ConstantSDNode>(Op.getOperand(1)) ||
5041 !isa<ConstantSDNode>(Op.getOperand(2)))
5042 return SDValue();
5043
5044 SDLoc dl(Op);
5045 APInt X = Op.getConstantOperandAPInt(1);
5046 APInt Y = Op.getConstantOperandAPInt(2);
5047 bool Overflow;
5048 APInt NumActiveElems =
5049 IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
5050
5051 if (Overflow)
5052 return SDValue();
5053
5054 if (IsEqual) {
5055 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
5056 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
5057 : NumActiveElems.uadd_ov(One, Overflow);
5058 if (Overflow)
5059 return SDValue();
5060 }
5061
5062   std::optional<unsigned> PredPattern =
5063       getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue());
5064   unsigned MinSVEVectorSize = std::max(
5065       DAG.getSubtarget<AArch64Subtarget>().getMinSVEVectorSizeInBits(), 128u);
5066 unsigned ElementSize = 128 / Op.getValueType().getVectorMinNumElements();
5067 if (PredPattern != std::nullopt &&
5068 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
5069 return getPTrue(DAG, dl, Op.getValueType(), *PredPattern);
5070
5071 return SDValue();
5072}
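// A scalar sketch of the unsigned active-element count computed above, under
// the same assumptions: bail out on wrap, and add one for the inclusive
// whilels/whilele forms. The helper name is illustrative only.
static std::optional<uint64_t> activeElemCount(uint64_t X, uint64_t Y,
                                               bool IsEqual) {
  if (Y < X)                     // usub_ov would report overflow
    return std::nullopt;
  uint64_t NumActiveElems = Y - X;
  if (IsEqual) {
    if (NumActiveElems == ~0ull) // uadd_ov would report overflow
      return std::nullopt;
    ++NumActiveElems;            // "less than or equal" variants include Y
  }
  return NumActiveElems;
}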
5073
5074// Returns a safe bitcast between two scalable vector predicates, where
5075 // any newly created lanes from a widening bitcast are defined as zero.
5076 static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
5077   SDLoc DL(Op);
5078 EVT InVT = Op.getValueType();
5079
5080 assert(InVT.getVectorElementType() == MVT::i1 &&
5081 VT.getVectorElementType() == MVT::i1 &&
5082 "Expected a predicate-to-predicate bitcast");
5084 InVT.isScalableVector() &&
5085 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
5086 "Only expect to cast between legal scalable predicate types!");
5087
5088 // Return the operand if the cast isn't changing type,
5089 // e.g. <n x 16 x i1> -> <n x 16 x i1>
5090 if (InVT == VT)
5091 return Op;
5092
5093 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
5094
5095 // We only have to zero the lanes if new lanes are being defined, e.g. when
5096 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
5097 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
5098 // we can return here.
5099 if (InVT.bitsGT(VT))
5100 return Reinterpret;
5101
5102 // Check if the other lanes are already known to be zeroed by
5103   // construction.
5104   if (isZeroingInactiveLanes(Op))
5105     return Reinterpret;
5106
5107 // Zero the newly introduced lanes.
5108 SDValue Mask = DAG.getConstant(1, DL, InVT);
5109 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
5110 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
5111}
5112
5113SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
5114 SDValue Chain, SDLoc DL,
5115 EVT VT) const {
5116   SDValue Callee = DAG.getExternalSymbol("__arm_sme_state",
5117                                          getPointerTy(DAG.getDataLayout()));
5118   Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
5119   Type *RetTy = StructType::get(Int64Ty, Int64Ty);
5120   TargetLowering::CallLoweringInfo CLI(DAG);
5121   ArgListTy Args;
5122   CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
5123       CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2,
5124       RetTy, Callee, std::move(Args));
5125 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5126 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
5127 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
5128 Mask);
5129}
5130
5131// Lower an SME LDR/STR ZA intrinsic
5132// Case 1: If the vector number (vecnum) is an immediate in range, it gets
5133// folded into the instruction
5134// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
5135// Case 2: If the vecnum is not an immediate, then it is used to modify the base
5136// and tile slice registers
5137// ldr(%tileslice, %ptr, %vecnum)
5138// ->
5139// %svl = rdsvl
5140// %ptr2 = %ptr + %svl * %vecnum
5141// %tileslice2 = %tileslice + %vecnum
5142// ldr [%tileslice2, 0], [%ptr2, 0]
5143// Case 3: If the vecnum is an immediate out of range, then the same is done as
5144// case 2, but the base and slice registers are modified by the greatest
5145// multiple of 15 lower than the vecnum and the remainder is folded into the
5146// instruction. This means that successive loads and stores that are offset from
5147// each other can share the same base and slice register updates.
5148// ldr(%tileslice, %ptr, 22)
5149// ldr(%tileslice, %ptr, 23)
5150// ->
5151// %svl = rdsvl
5152// %ptr2 = %ptr + %svl * 15
5153// %tileslice2 = %tileslice + 15
5154// ldr [%tileslice2, 7], [%ptr2, 7]
5155// ldr [%tileslice2, 8], [%ptr2, 8]
5156// Case 4: If the vecnum is an add of an immediate, then the non-immediate
5157// operand and the immediate can be folded into the instruction, like case 2.
5158// ldr(%tileslice, %ptr, %vecnum + 7)
5159// ldr(%tileslice, %ptr, %vecnum + 8)
5160// ->
5161// %svl = rdsvl
5162// %ptr2 = %ptr + %svl * %vecnum
5163// %tileslice2 = %tileslice + %vecnum
5164// ldr [%tileslice2, 7], [%ptr2, 7]
5165// ldr [%tileslice2, 8], [%ptr2, 8]
5166// Case 5: The vecnum being an add of an immediate out of range is also handled,
5167 // in which case the same remainder logic as case 3 is used.
5168 static SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
5169   SDLoc DL(N);
5170
5171 SDValue TileSlice = N->getOperand(2);
5172 SDValue Base = N->getOperand(3);
5173 SDValue VecNum = N->getOperand(4);
5174 int32_t ConstAddend = 0;
5175 SDValue VarAddend = VecNum;
5176
5177 // If the vnum is an add of an immediate, we can fold it into the instruction
5178 if (VecNum.getOpcode() == ISD::ADD &&
5179 isa<ConstantSDNode>(VecNum.getOperand(1))) {
5180 ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
5181 VarAddend = VecNum.getOperand(0);
5182 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
5183 ConstAddend = ImmNode->getSExtValue();
5184 VarAddend = SDValue();
5185 }
5186
5187 int32_t ImmAddend = ConstAddend % 16;
5188 if (int32_t C = (ConstAddend - ImmAddend)) {
5189 SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
5190 VarAddend = VarAddend
5191 ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
5192 : CVal;
5193 }
5194
5195 if (VarAddend) {
5196 // Get the vector length that will be multiplied by vnum
5197 auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
5198 DAG.getConstant(1, DL, MVT::i32));
5199
5200 // Multiply SVL and vnum then add it to the base
5201 SDValue Mul = DAG.getNode(
5202 ISD::MUL, DL, MVT::i64,
5203 {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
5204 Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
5205 // Just add vnum to the tileslice
5206 TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
5207 }
5208
5209   return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
5210                      DL, MVT::Other,
5211 {/*Chain=*/N.getOperand(0), TileSlice, Base,
5212 DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
5213}
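// A hypothetical scalar helper showing the vecnum split used above: the
// immediate folded into the LDR/STR must stay in [0, 15], and the rest is
// applied to the base pointer and tile slice. For example, 22 splits into a
// register adjustment of 16 plus an immediate of 6.
static std::pair<int32_t, int32_t> splitSMEVecNum(int32_t ConstAddend) {
  int32_t ImmAddend = ConstAddend % 16;         // stays in the instruction
  int32_t RegAddend = ConstAddend - ImmAddend;  // folded into base/tile slice
  return {RegAddend, ImmAddend};
}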
5214
5215SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
5216 SelectionDAG &DAG) const {
5217 unsigned IntNo = Op.getConstantOperandVal(1);
5218 SDLoc DL(Op);
5219 switch (IntNo) {
5220 default:
5221 return SDValue(); // Don't custom lower most intrinsics.
5222 case Intrinsic::aarch64_prefetch: {
5223 SDValue Chain = Op.getOperand(0);
5224 SDValue Addr = Op.getOperand(2);
5225
5226 unsigned IsWrite = Op.getConstantOperandVal(3);
5227 unsigned Locality = Op.getConstantOperandVal(4);
5228 unsigned IsStream = Op.getConstantOperandVal(5);
5229 unsigned IsData = Op.getConstantOperandVal(6);
5230 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
5231 (!IsData << 3) | // IsDataCache bit
5232 (Locality << 1) | // Cache level bits
5233 (unsigned)IsStream; // Stream bit
5234
5235 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
5236 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
5237 }
5238 case Intrinsic::aarch64_sme_str:
5239 case Intrinsic::aarch64_sme_ldr: {
5240 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
5241 }
5242 case Intrinsic::aarch64_sme_za_enable:
5243 return DAG.getNode(
5244 AArch64ISD::SMSTART, DL, MVT::Other,
5245 Op->getOperand(0), // Chain
5246 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5247 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5248 case Intrinsic::aarch64_sme_za_disable:
5249 return DAG.getNode(
5250 AArch64ISD::SMSTOP, DL, MVT::Other,
5251 Op->getOperand(0), // Chain
5252 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5253 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5254 }
5255}
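// An illustrative encoder for the prefetch operand built in the
// aarch64_prefetch case above; the static_assert is only a sanity check of
// the bit layout (write bit, data-cache bit, locality bits, stream bit) and
// the helper is not part of this lowering.
static constexpr unsigned encodePrfOp(unsigned IsWrite, unsigned IsData,
                                      unsigned Locality, unsigned IsStream) {
  return (IsWrite << 4) | (!IsData << 3) | (Locality << 1) | IsStream;
}
static_assert(encodePrfOp(/*IsWrite=*/1, /*IsData=*/1, /*Locality=*/0,
                          /*IsStream=*/1) == 0b10001,
              "write, data cache, locality 0, streaming");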
5256
5257SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5258 SelectionDAG &DAG) const {
5259 unsigned IntNo = Op.getConstantOperandVal(1);
5260 SDLoc DL(Op);
5261 switch (IntNo) {
5262 default:
5263 return SDValue(); // Don't custom lower most intrinsics.
5264 case Intrinsic::aarch64_mops_memset_tag: {
5265 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
5266 SDValue Chain = Node->getChain();
5267 SDValue Dst = Op.getOperand(2);
5268 SDValue Val = Op.getOperand(3);
5269 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
5270 SDValue Size = Op.getOperand(4);
5271 auto Alignment = Node->getMemOperand()->getAlign();
5272 bool IsVol = Node->isVolatile();
5273 auto DstPtrInfo = Node->getPointerInfo();
5274
5275 const auto &SDI =
5276 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
5277 SDValue MS =
5278 SDI.EmitMOPS(AArch64ISD::MOPS_MEMSET_TAGGING, DAG, DL, Chain, Dst, Val,
5279 Size, Alignment, IsVol, DstPtrInfo, MachinePointerInfo{});
5280
5281 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
5282 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
5283 // LowerOperationWrapper will complain that the number of results has
5284 // changed.
5285 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
5286 }
5287 }
5288}
5289
5290SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
5291 SelectionDAG &DAG) const {
5292 unsigned IntNo = Op.getConstantOperandVal(0);
5293 SDLoc dl(Op);
5294 switch (IntNo) {
5295 default: return SDValue(); // Don't custom lower most intrinsics.
5296 case Intrinsic::thread_pointer: {
5297 EVT PtrVT = getPointerTy(DAG.getDataLayout());
5298 return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
5299 }
5300 case Intrinsic::aarch64_neon_abs: {
5301 EVT Ty = Op.getValueType();
5302 if (Ty == MVT::i64) {
5303 SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
5304 Op.getOperand(1));
5305 Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
5306 return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
5307 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
5308 return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
5309 } else {
5310       report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
5311 }
5312 }
5313 case Intrinsic::aarch64_neon_pmull64: {
5314 SDValue LHS = Op.getOperand(1);
5315 SDValue RHS = Op.getOperand(2);
5316
5317     std::optional<uint64_t> LHSLane =
5318         getConstantLaneNumOfExtractHalfOperand(LHS);
5319     std::optional<uint64_t> RHSLane =
5320         getConstantLaneNumOfExtractHalfOperand(RHS);
5321
5322 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
5323 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
5324
5325     // 'aarch64_neon_pmull64' takes i64 parameters, while pmull/pmull2
5326 // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
5327 // which ISel recognizes better. For example, generate a ldr into d*
5328 // registers as opposed to a GPR load followed by a fmov.
5329 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
5330 std::optional<uint64_t> OtherLane,
5331 const SDLoc &dl,
5332 SelectionDAG &DAG) -> SDValue {
5333       // If the operand is a higher half itself, rewrite it to
5334 // extract_high_v2i64; this way aarch64_neon_pmull64 could
5335 // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
5336 if (NLane && *NLane == 1)
5337 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
5338 N.getOperand(0), DAG.getConstant(1, dl, MVT::i64));
5339
5340 // Operand N is not a higher half but the other operand is.
5341 if (OtherLane && *OtherLane == 1) {
5342 // If this operand is a lower half, rewrite it to
5343 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
5344 // align lanes of two operands. A roundtrip sequence (to move from lane
5345 // 1 to lane 0) is like this:
5346 // mov x8, v0.d[1]
5347 // fmov d0, x8
5348 if (NLane && *NLane == 0)
5349 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
5350 DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64,
5351 N.getOperand(0),
5352 DAG.getConstant(0, dl, MVT::i64)),
5353 DAG.getConstant(1, dl, MVT::i64));
5354
5355 // Otherwise just dup from main to all lanes.
5356 return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N);
5357 }
5358
5359 // Neither operand is an extract of higher half, so codegen may just use
5360 // the non-high version of PMULL instruction. Use v1i64 to represent i64.
5361 assert(N.getValueType() == MVT::i64 &&
5362 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
5363 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N);
5364 };
5365
5366 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG);
5367 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG);
5368
5369 return DAG.getNode(AArch64ISD::PMULL, dl, Op.getValueType(), LHS, RHS);
5370 }
5371 case Intrinsic::aarch64_neon_smax:
5372 return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
5373 Op.getOperand(1), Op.getOperand(2));
5374 case Intrinsic::aarch64_neon_umax:
5375 return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
5376 Op.getOperand(1), Op.getOperand(2));
5377 case Intrinsic::aarch64_neon_smin:
5378 return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
5379 Op.getOperand(1), Op.getOperand(2));
5380 case Intrinsic::aarch64_neon_umin:
5381 return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
5382 Op.getOperand(1), Op.getOperand(2));
5383 case Intrinsic::aarch64_neon_scalar_sqxtn:
5384 case Intrinsic::aarch64_neon_scalar_sqxtun:
5385 case Intrinsic::aarch64_neon_scalar_uqxtn: {
5386 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
5387 if (Op.getValueType() == MVT::i32)
5388 return DAG.getNode(ISD::BITCAST, dl, MVT::i32,
5389 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::f32,
5390 Op.getOperand(0),
5391 DAG.getNode(ISD::BITCAST, dl, MVT::f64,
5392 Op.getOperand(1))));
5393 return SDValue();
5394 }
5395 case Intrinsic::aarch64_sve_whilelo:
5396 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
5397 /*IsEqual=*/false);
5398 case Intrinsic::aarch64_sve_whilelt:
5399 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
5400 /*IsEqual=*/false);
5401 case Intrinsic::aarch64_sve_whilels:
5402 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
5403 /*IsEqual=*/true);
5404 case Intrinsic::aarch64_sve_whilele:
5405 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
5406 /*IsEqual=*/true);
5407 case Intrinsic::aarch64_sve_sunpkhi:
5408 return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
5409 Op.getOperand(1));
5410 case Intrinsic::aarch64_sve_sunpklo:
5411 return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
5412 Op.getOperand(1));
5413 case Intrinsic::aarch64_sve_uunpkhi:
5414 return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
5415 Op.getOperand(1));
5416 case Intrinsic::aarch64_sve_uunpklo:
5417 return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
5418 Op.getOperand(1));
5419 case Intrinsic::aarch64_sve_clasta_n:
5420 return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
5421 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5422 case Intrinsic::aarch64_sve_clastb_n:
5423 return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
5424 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5425 case Intrinsic::aarch64_sve_lasta:
5426 return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
5427 Op.getOperand(1), Op.getOperand(2));
5428 case Intrinsic::aarch64_sve_lastb:
5429 return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
5430 Op.getOperand(1), Op.getOperand(2));
5431 case Intrinsic::aarch64_sve_rev:
5432 return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
5433 Op.getOperand(1));
5434 case Intrinsic::aarch64_sve_tbl:
5435 return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
5436 Op.getOperand(1), Op.getOperand(2));
5437 case Intrinsic::aarch64_sve_trn1:
5438 return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
5439 Op.getOperand(1), Op.getOperand(2));
5440 case Intrinsic::aarch64_sve_trn2:
5441 return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
5442 Op.getOperand(1), Op.getOperand(2));
5443 case Intrinsic::aarch64_sve_uzp1:
5444 return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
5445 Op.getOperand(1), Op.getOperand(2));
5446 case Intrinsic::aarch64_sve_uzp2:
5447 return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
5448 Op.getOperand(1), Op.getOperand(2));
5449 case Intrinsic::aarch64_sve_zip1:
5450 return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
5451 Op.getOperand(1), Op.getOperand(2));
5452 case Intrinsic::aarch64_sve_zip2:
5453 return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
5454 Op.getOperand(1), Op.getOperand(2));
5455 case Intrinsic::aarch64_sve_splice:
5456 return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
5457 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5458 case Intrinsic::aarch64_sve_ptrue:
5459 return getPTrue(DAG, dl, Op.getValueType(), Op.getConstantOperandVal(1));
5460 case Intrinsic::aarch64_sve_clz:
5461 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
5462 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5463 case Intrinsic::aarch64_sme_cntsb:
5464 return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5465 DAG.getConstant(1, dl, MVT::i32));
5466 case Intrinsic::aarch64_sme_cntsh: {
5467 SDValue One = DAG.getConstant(1, dl, MVT::i32);
5468 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One);
5469 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One);
5470 }
5471 case Intrinsic::aarch64_sme_cntsw: {
5472 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5473 DAG.getConstant(1, dl, MVT::i32));
5474 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
5475 DAG.getConstant(2, dl, MVT::i32));
5476 }
5477 case Intrinsic::aarch64_sme_cntsd: {
5478 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5479 DAG.getConstant(1, dl, MVT::i32));
5480 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
5481 DAG.getConstant(3, dl, MVT::i32));
5482 }
5483 case Intrinsic::aarch64_sve_cnt: {
5484 SDValue Data = Op.getOperand(3);
5485 // CTPOP only supports integer operands.
5486 if (Data.getValueType().isFloatingPoint())
5487 Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
5488 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
5489 Op.getOperand(2), Data, Op.getOperand(1));
5490 }
5491 case Intrinsic::aarch64_sve_dupq_lane:
5492 return LowerDUPQLane(Op, DAG);
5493 case Intrinsic::aarch64_sve_convert_from_svbool:
5494 if (Op.getValueType() == MVT::aarch64svcount)
5495 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Op.getOperand(1));
5496 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
5497 case Intrinsic::aarch64_sve_convert_to_svbool:
5498 if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
5499 return DAG.getNode(ISD::BITCAST, dl, MVT::nxv16i1, Op.getOperand(1));
5500 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
5501 case Intrinsic::aarch64_sve_fneg:
5502 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
5503 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5504 case Intrinsic::aarch64_sve_frintp:
5505 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
5506 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5507 case Intrinsic::aarch64_sve_frintm:
5508 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
5509 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5510 case Intrinsic::aarch64_sve_frinti:
5511 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(),
5512 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5513 case Intrinsic::aarch64_sve_frintx:
5514 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
5515 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5516 case Intrinsic::aarch64_sve_frinta:
5517 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
5518 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5519 case Intrinsic::aarch64_sve_frintn:
5520 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(),
5521 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5522 case Intrinsic::aarch64_sve_frintz:
5523 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
5524 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5525   case Intrinsic::aarch64_sve_ucvtf:
5526     return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl,
5527                        Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5528 Op.getOperand(1));
5529   case Intrinsic::aarch64_sve_scvtf:
5530     return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl,
5531                        Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5532 Op.getOperand(1));
5533   case Intrinsic::aarch64_sve_fcvtzu:
5534     return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl,
5535                        Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5536 Op.getOperand(1));
5537   case Intrinsic::aarch64_sve_fcvtzs:
5538     return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl,
5539                        Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5540 Op.getOperand(1));
5541 case Intrinsic::aarch64_sve_fsqrt:
5542 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
5543 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5544 case Intrinsic::aarch64_sve_frecpx:
5545 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
5546 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5547 case Intrinsic::aarch64_sve_frecpe_x:
5548 return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(),
5549 Op.getOperand(1));
5550 case Intrinsic::aarch64_sve_frecps_x:
5551 return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(),
5552 Op.getOperand(1), Op.getOperand(2));
5553 case Intrinsic::aarch64_sve_frsqrte_x:
5554 return DAG.getNode(AArch64ISD::FRSQRTE, dl, Op.getValueType(),
5555 Op.getOperand(1));
5556 case Intrinsic::aarch64_sve_frsqrts_x:
5557 return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(),
5558 Op.getOperand(1), Op.getOperand(2));
5559 case Intrinsic::aarch64_sve_fabs:
5560 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
5561 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5562 case Intrinsic::aarch64_sve_abs:
5563 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
5564 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5565 case Intrinsic::aarch64_sve_neg:
5566 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
5567 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5568 case Intrinsic::aarch64_sve_insr: {
5569 SDValue Scalar = Op.getOperand(2);
5570 EVT ScalarTy = Scalar.getValueType();
5571 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
5572 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
5573
5574 return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
5575 Op.getOperand(1), Scalar);
5576 }
5577   case Intrinsic::aarch64_sve_rbit:
5578     return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl,
5579                        Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5580 Op.getOperand(1));
5581 case Intrinsic::aarch64_sve_revb:
5582 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
5583 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5584 case Intrinsic::aarch64_sve_revh:
5585 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(),
5586 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5587 case Intrinsic::aarch64_sve_revw:
5588 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(),
5589 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5590 case Intrinsic::aarch64_sve_revd:
5591 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(),
5592 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5593 case Intrinsic::aarch64_sve_sxtb:
5594     return DAG.getNode(
5595         AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5596         Op.getOperand(2), Op.getOperand(3),
5597 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
5598 Op.getOperand(1));
5599 case Intrinsic::aarch64_sve_sxth:
5600     return DAG.getNode(
5601         AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5602         Op.getOperand(2), Op.getOperand(3),
5603 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
5604 Op.getOperand(1));
5605 case Intrinsic::aarch64_sve_sxtw:
5606     return DAG.getNode(
5607         AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5608         Op.getOperand(2), Op.getOperand(3),
5609 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
5610 Op.getOperand(1));
5611 case Intrinsic::aarch64_sve_uxtb:
5612     return DAG.getNode(
5613         AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5614         Op.getOperand(2), Op.getOperand(3),
5615 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
5616 Op.getOperand(1));
5617 case Intrinsic::aarch64_sve_uxth:
5618     return DAG.getNode(
5619         AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5620         Op.getOperand(2), Op.getOperand(3),
5621 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
5622 Op.getOperand(1));
5623 case Intrinsic::aarch64_sve_uxtw:
5624     return DAG.getNode(
5625         AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5626         Op.getOperand(2), Op.getOperand(3),
5627 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
5628 Op.getOperand(1));
5629 case Intrinsic::localaddress: {
5630 const auto &MF = DAG.getMachineFunction();
5631 const auto *RegInfo = Subtarget->getRegisterInfo();
5632 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
5633 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
5634 Op.getSimpleValueType());
5635 }
5636
5637 case Intrinsic::eh_recoverfp: {
5638 // FIXME: This needs to be implemented to correctly handle highly aligned
5639 // stack objects. For now we simply return the incoming FP. Refer D53541
5640 // for more details.
5641 SDValue FnOp = Op.getOperand(1);
5642 SDValue IncomingFPOp = Op.getOperand(2);
5643 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
5644 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
5645     if (!Fn)
5646       report_fatal_error(
5647           "llvm.eh.recoverfp must take a function as the first argument");
5648 return IncomingFPOp;
5649 }
5650
5651 case Intrinsic::aarch64_neon_vsri:
5652 case Intrinsic::aarch64_neon_vsli:
5653 case Intrinsic::aarch64_sve_sri:
5654 case Intrinsic::aarch64_sve_sli: {
5655 EVT Ty = Op.getValueType();
5656
5657 if (!Ty.isVector())
5658 report_fatal_error("Unexpected type for aarch64_neon_vsli");
5659
5660 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
5661
5662 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
5663 IntNo == Intrinsic::aarch64_sve_sri;
5664 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
5665 return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
5666 Op.getOperand(3));
5667 }
5668
5669 case Intrinsic::aarch64_neon_srhadd:
5670 case Intrinsic::aarch64_neon_urhadd:
5671 case Intrinsic::aarch64_neon_shadd:
5672 case Intrinsic::aarch64_neon_uhadd: {
5673 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
5674 IntNo == Intrinsic::aarch64_neon_shadd);
5675 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
5676 IntNo == Intrinsic::aarch64_neon_urhadd);
5677 unsigned Opcode = IsSignedAdd
5678 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
5679 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
5680 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
5681 Op.getOperand(2));
5682 }
5683 case Intrinsic::aarch64_neon_saddlp:
5684 case Intrinsic::aarch64_neon_uaddlp: {
5685     unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
5686                           ? AArch64ISD::UADDLP
5687                           : AArch64ISD::SADDLP;
5688     return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
5689 }
5690 case Intrinsic::aarch64_neon_sdot:
5691 case Intrinsic::aarch64_neon_udot:
5692 case Intrinsic::aarch64_sve_sdot:
5693 case Intrinsic::aarch64_sve_udot: {
5694     unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
5695                        IntNo == Intrinsic::aarch64_sve_udot)
5696                           ? AArch64ISD::UDOT
5697                           : AArch64ISD::SDOT;
5698 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
5699 Op.getOperand(2), Op.getOperand(3));
5700 }
5701 case Intrinsic::get_active_lane_mask: {
5702 SDValue ID =
5703 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
5704 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), ID,
5705 Op.getOperand(1), Op.getOperand(2));
5706 }
5707 case Intrinsic::aarch64_neon_uaddlv: {
5708 EVT OpVT = Op.getOperand(1).getValueType();
5709 EVT ResVT = Op.getValueType();
5710 if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
5711 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) {
5712       // Use v4i32 rather than v2i32 to avoid an insert_subvector.
5713 SDValue UADDLV =
5714 DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1));
5715 SDValue EXTRACT_VEC_ELT =
5716 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, UADDLV,
5717 DAG.getConstant(0, dl, MVT::i64));
5718 return EXTRACT_VEC_ELT;
5719 }
5720 return SDValue();
5721 }
5722 case Intrinsic::experimental_cttz_elts: {
5723 SDValue NewCttzElts =
5724 DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, Op.getOperand(1));
5725
5726 return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
5727 }
5728 }
5729}
5730
5731bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
5732 if (VT.getVectorElementType() == MVT::i8 ||
5733 VT.getVectorElementType() == MVT::i16) {
5734 EltTy = MVT::i32;
5735 return true;
5736 }
5737 return false;
5738}
5739
5740bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
5741 EVT DataVT) const {
5742 const EVT IndexVT = Extend.getOperand(0).getValueType();
5743 // SVE only supports implicit extension of 32-bit indices.
5744 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
5745 return false;
5746
5747 // Indices cannot be smaller than the main data type.
5748 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
5749 return false;
5750
5751 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
5752 // element container type, which would violate the previous clause.
5753 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
5754}
5755
5756bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
5757 EVT ExtVT = ExtVal.getValueType();
5758 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
5759 return false;
5760
5761 // It may be worth creating extending masked loads if there are multiple
5762 // masked loads using the same predicate. That way we'll end up creating
5763 // extending masked loads that may then get split by the legaliser. This
5764 // results in just one set of predicate unpacks at the start, instead of
5765 // multiple sets of vector unpacks after each load.
5766 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
5767 if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) {
5768 // Disable extending masked loads for fixed-width for now, since the code
5769 // quality doesn't look great.
5770 if (!ExtVT.isScalableVector())
5771 return false;
5772
5773 unsigned NumExtMaskedLoads = 0;
5774 for (auto *U : Ld->getMask()->uses())
5775 if (isa<MaskedLoadSDNode>(U))
5776 NumExtMaskedLoads++;
5777
5778 if (NumExtMaskedLoads <= 1)
5779 return false;
5780 }
5781 }
5782
5783 return true;
5784}
5785
5786unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
5787 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
5788       {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
5789        AArch64ISD::GLD1_MERGE_ZERO},
5790       {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
5791        AArch64ISD::GLD1_UXTW_MERGE_ZERO},
5792       {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
5793        AArch64ISD::GLD1_MERGE_ZERO},
5794       {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
5795        AArch64ISD::GLD1_SXTW_MERGE_ZERO},
5796       {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
5797        AArch64ISD::GLD1_SCALED_MERGE_ZERO},
5798       {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
5799        AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
5800       {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
5801        AArch64ISD::GLD1_SCALED_MERGE_ZERO},
5802       {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
5803        AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
5804 };
5805 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
5806 return AddrModes.find(Key)->second;
5807}
5808
5809unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
5810 switch (Opcode) {
5811 default:
5812 llvm_unreachable("unimplemented opcode");
5813     return Opcode;
5814   case AArch64ISD::GLD1_MERGE_ZERO:
5815     return AArch64ISD::GLD1S_MERGE_ZERO;
5816   case AArch64ISD::GLD1_IMM_MERGE_ZERO:
5817     return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
5818   case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
5819     return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
5820   case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
5821     return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
5822   case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
5823     return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
5824   case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
5825     return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
5826   case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
5827     return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
5828 }
5829}
5830
5831SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
5832 SelectionDAG &DAG) const {
5833 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
5834
5835 SDLoc DL(Op);
5836 SDValue Chain = MGT->getChain();
5837 SDValue PassThru = MGT->getPassThru();
5838 SDValue Mask = MGT->getMask();
5839 SDValue BasePtr = MGT->getBasePtr();
5840 SDValue Index = MGT->getIndex();
5841 SDValue Scale = MGT->getScale();
5842 EVT VT = Op.getValueType();
5843 EVT MemVT = MGT->getMemoryVT();
5844 ISD::LoadExtType ExtType = MGT->getExtensionType();
5845 ISD::MemIndexType IndexType = MGT->getIndexType();
5846
5847 // SVE supports zero (and so undef) passthrough values only, everything else
5848 // must be handled manually by an explicit select on the load's output.
5849 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
5850 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
5851 SDValue Load =
5852 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
5853 MGT->getMemOperand(), IndexType, ExtType);
5854 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
5855 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
5856 }
5857
5858 bool IsScaled = MGT->isIndexScaled();
5859 bool IsSigned = MGT->isIndexSigned();
5860
5861   // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
5862   // must be calculated beforehand.
5863 uint64_t ScaleVal = Scale->getAsZExtVal();
5864 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
5865 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
5866 EVT IndexVT = Index.getValueType();
5867 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
5868 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
5869 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
5870
5871 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
5872 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
5873 MGT->getMemOperand(), IndexType, ExtType);
5874 }
5875
5876 // Lower fixed length gather to a scalable equivalent.
5877 if (VT.isFixedLengthVector()) {
5878 assert(Subtarget->useSVEForFixedLengthVectors() &&
5879 "Cannot lower when not using SVE for fixed vectors!");
5880
5881     // NOTE: Handle floating-point as if integer then bitcast the result.
5882     EVT DataVT = VT.changeVectorElementTypeToInteger();
5883     MemVT = MemVT.changeVectorElementTypeToInteger();
5884
5885 // Find the smallest integer fixed length vector we can use for the gather.
5886 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
5887 if (DataVT.getVectorElementType() == MVT::i64 ||
5888 Index.getValueType().getVectorElementType() == MVT::i64 ||
5889 Mask.getValueType().getVectorElementType() == MVT::i64)
5890 PromotedVT = VT.changeVectorElementType(MVT::i64);
5891
5892 // Promote vector operands except for passthrough, which we know is either
5893 // undef or zero, and thus best constructed directly.
5894 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5895 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
5896 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
5897
5898 // A promoted result type forces the need for an extending load.
5899 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
5900 ExtType = ISD::EXTLOAD;
5901
5902 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
5903
5904 // Convert fixed length vector operands to scalable.
5905 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
5906     Index = convertToScalableVector(DAG, ContainerVT, Index);
5907     Mask = convertFixedMaskToScalableVector(Mask, DAG);
5908     PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
5909 : DAG.getConstant(0, DL, ContainerVT);
5910
5911 // Emit equivalent scalable vector gather.
5912 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
5913 SDValue Load =
5914 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
5915 Ops, MGT->getMemOperand(), IndexType, ExtType);
5916
5917 // Extract fixed length data then convert to the required result type.
5918 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
5919 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
5920 if (VT.isFloatingPoint())
5921 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
5922
5923 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
5924 }
5925
5926 // Everything else is legal.
5927 return Op;
5928}
5929
5930SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
5931 SelectionDAG &DAG) const {
5932 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
5933
5934 SDLoc DL(Op);
5935 SDValue Chain = MSC->getChain();
5936 SDValue StoreVal = MSC->getValue();
5937 SDValue Mask = MSC->getMask();
5938 SDValue BasePtr = MSC->getBasePtr();
5939 SDValue Index = MSC->getIndex();
5940 SDValue Scale = MSC->getScale();
5941 EVT VT = StoreVal.getValueType();
5942 EVT MemVT = MSC->getMemoryVT();
5943 ISD::MemIndexType IndexType = MSC->getIndexType();
5944 bool Truncating = MSC->isTruncatingStore();
5945
5946 bool IsScaled = MSC->isIndexScaled();
5947 bool IsSigned = MSC->isIndexSigned();
5948
5949 // SVE supports an index scaled by sizeof(MemVT.elt) only; any other scale
5950 // must be applied to the index beforehand.
5951 uint64_t ScaleVal = Scale->getAsZExtVal();
5952 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
5953 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
5954 EVT IndexVT = Index.getValueType();
5955 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
5956 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
5957 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
5958
5959 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
5960 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
5961 MSC->getMemOperand(), IndexType, Truncating);
5962 }
5963
5964 // Lower fixed length scatter to a scalable equivalent.
5965 if (VT.isFixedLengthVector()) {
5966 assert(Subtarget->useSVEForFixedLengthVectors() &&
5967 "Cannot lower when not using SVE for fixed vectors!");
5968
5969 // Once bitcast, we treat floating-point scatters as if they were integer.
5970 if (VT.isFloatingPoint()) {
5971 VT = VT.changeVectorElementTypeToInteger();
5972 MemVT = MemVT.changeVectorElementTypeToInteger();
5973 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
5974 }
5975
5976 // Find the smallest integer fixed length vector we can use for the scatter.
5977 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
5978 if (VT.getVectorElementType() == MVT::i64 ||
5979 Index.getValueType().getVectorElementType() == MVT::i64 ||
5980 Mask.getValueType().getVectorElementType() == MVT::i64)
5981 PromotedVT = VT.changeVectorElementType(MVT::i64);
5982
5983 // Promote vector operands.
5984 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5985 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
5986 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
5987 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
5988
5989 // A promoted value type forces the need for a truncating store.
5990 if (PromotedVT != VT)
5991 Truncating = true;
5992
5993 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
5994
5995 // Convert fixed length vector operands to scalable.
5996 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
5997 Index = convertToScalableVector(DAG, ContainerVT, Index);
5998 Mask = convertFixedMaskToScalableVector(Mask, DAG);
5999 StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
6000
6001 // Emit equivalent scalable vector scatter.
6002 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6003 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6004 MSC->getMemOperand(), IndexType, Truncating);
6005 }
6006
6007 // Everything else is legal.
6008 return Op;
6009}
6010
6011SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
6012 SDLoc DL(Op);
6013 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
6014 assert(LoadNode && "Expected custom lowering of a masked load node");
6015 EVT VT = Op->getValueType(0);
6016
6017 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
6018 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
6019
6020 SDValue PassThru = LoadNode->getPassThru();
6021 SDValue Mask = LoadNode->getMask();
6022
6023 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
6024 return Op;
6025
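  // As a rough sketch, a masked load whose passthrough is neither undef nor
  // zero is re-emitted below with an undef passthrough, and the original
  // passthrough is blended back in with a select under the same mask.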
6026 SDValue Load = DAG.getMaskedLoad(
6027 VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
6028 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
6029 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
6030 LoadNode->getExtensionType());
6031
6032 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6033
6034 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6035}
6036
6037// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
6038static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
6039 EVT VT, EVT MemVT,
6040 SelectionDAG &DAG) {
6041 assert(VT.isVector() && "VT should be a vector type");
6042 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
6043
6044 SDValue Value = ST->getValue();
6045
6046 // It first extends the promoted v4i16 to v8i16, truncates it to v8i8, and
6047 // extracts the word lane that represents the v4i8 subvector, optimizing the
6048 // store to:
6049 //
6050 // xtn v0.8b, v0.8h
6051 // str s0, [x0]
6052
6053 SDValue Undef = DAG.getUNDEF(MVT::i16);
6054 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
6055 {Undef, Undef, Undef, Undef});
6056
6057 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
6058 Value, UndefVec);
6059 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
6060
6061 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
6062 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
6063 Trunc, DAG.getConstant(0, DL, MVT::i64));
6064
6065 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
6066 ST->getBasePtr(), ST->getMemOperand());
6067}
6068
6069// Custom lowering for any store, vector or scalar, with or without a
6070// truncating operation. Currently we only custom lower truncating stores
6071// from v4i16 to v4i8 and volatile stores of i128.
6072SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
6073 SelectionDAG &DAG) const {
6074 SDLoc Dl(Op);
6075 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
6076 assert (StoreNode && "Can only custom lower store nodes");
6077
6078 SDValue Value = StoreNode->getValue();
6079
6080 EVT VT = Value.getValueType();
6081 EVT MemVT = StoreNode->getMemoryVT();
6082
6083 if (VT.isVector()) {
6084 if (useSVEForFixedLengthVectorVT(
6085 VT,
6086 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
6087 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
6088
6089 unsigned AS = StoreNode->getAddressSpace();
6090 Align Alignment = StoreNode->getAlign();
6091 if (Alignment < MemVT.getStoreSize() &&
6092 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
6093 StoreNode->getMemOperand()->getFlags(),
6094 nullptr)) {
6095 return scalarizeVectorStore(StoreNode, DAG);
6096 }
6097
6098 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
6099 MemVT == MVT::v4i8) {
6100 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
6101 }
6102 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
6103 // the custom lowering, as there are no un-paired non-temporal stores and
6104 // legalization will break up 256 bit inputs.
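    // As a rough illustration, a non-temporal store of a v8i32 is split into
    // its two v4i32 halves and emitted as a single "stnp q0, q1, [x0]".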
6105 ElementCount EC = MemVT.getVectorElementCount();
6106 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
6107 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
6108 (MemVT.getScalarSizeInBits() == 8u ||
6109 MemVT.getScalarSizeInBits() == 16u ||
6110 MemVT.getScalarSizeInBits() == 32u ||
6111 MemVT.getScalarSizeInBits() == 64u)) {
6112 SDValue Lo =
6113 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
6114 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
6115 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
6116 SDValue Hi =
6117 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
6118 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
6119 StoreNode->getValue(),
6120 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
6121 SDValue Result = DAG.getMemIntrinsicNode(
6122 AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
6123 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
6124 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6125 return Result;
6126 }
6127 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
6128 return LowerStore128(Op, DAG);
6129 } else if (MemVT == MVT::i64x8) {
6130 SDValue Value = StoreNode->getValue();
6131 assert(Value->getValueType(0) == MVT::i64x8);
6132 SDValue Chain = StoreNode->getChain();
6133 SDValue Base = StoreNode->getBasePtr();
6134 EVT PtrVT = Base.getValueType();
6135 for (unsigned i = 0; i < 8; i++) {
6136 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
6137 Value, DAG.getConstant(i, Dl, MVT::i32));
6138 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
6139 DAG.getConstant(i * 8, Dl, PtrVT));
6140 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
6141 StoreNode->getOriginalAlign());
6142 }
6143 return Chain;
6144 }
6145
6146 return SDValue();
6147}
6148
6149/// Lower atomic or volatile 128-bit stores to a single STP or STILP instruction.
6150SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
6151 SelectionDAG &DAG) const {
6152 MemSDNode *StoreNode = cast<MemSDNode>(Op);
6153 assert(StoreNode->getMemoryVT() == MVT::i128);
6154 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
6155
6156 bool IsStoreRelease =
6157 StoreNode->getMergedOrdering() == AtomicOrdering::Release;
6158 if (StoreNode->isAtomic())
6159 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
6160 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
6161 StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
6162 StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
6163
6164 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
6165 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
6166 ? StoreNode->getOperand(1)
6167 : StoreNode->getOperand(2);
6168 SDLoc DL(Op);
6169 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
6170 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
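  // As a rough illustration, a volatile store of an i128 held in x2:x3 to the
  // address in x0 becomes "stp x2, x3, [x0]", while a 128-bit store-release on
  // a target with FEAT_LRCPC3 becomes "stilp x2, x3, [x0]".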
6171 if (DAG.getDataLayout().isBigEndian())
6172 std::swap(StoreValue.first, StoreValue.second);
6173 SDValue Result = DAG.getMemIntrinsicNode(
6174 Opcode, DL, DAG.getVTList(MVT::Other),
6175 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
6176 StoreNode->getBasePtr()},
6177 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6178 return Result;
6179}
6180
6181SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
6182 SelectionDAG &DAG) const {
6183 SDLoc DL(Op);
6184 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
6185 assert(LoadNode && "Expected custom lowering of a load node");
6186
6187 if (LoadNode->getMemoryVT() == MVT::i64x8) {
6188 SmallVector<SDValue, 8> Ops;
6189 SDValue Base = LoadNode->getBasePtr();
6190 SDValue Chain = LoadNode->getChain();
6191 EVT PtrVT = Base.getValueType();
6192 for (unsigned i = 0; i < 8; i++) {
6193 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
6194 DAG.getConstant(i * 8, DL, PtrVT));
6195 SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
6196 LoadNode->getPointerInfo(),
6197 LoadNode->getOriginalAlign());
6198 Ops.push_back(Part);
6199 Chain = SDValue(Part.getNode(), 1);
6200 }
6201 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
6202 return DAG.getMergeValues({Loaded, Chain}, DL);
6203 }
6204
6205 // Custom lowering for extending v4i8 vector loads.
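  // As a rough illustration, a zero-extending load from <4 x i8> to <4 x i16>
  // typically ends up as "ldr s0, [x0]" followed by "ushll v0.8h, v0.8b, #0",
  // with the low four lanes holding the result.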
6206 EVT VT = Op->getValueType(0);
6207 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
6208
6209 if (LoadNode->getMemoryVT() != MVT::v4i8)
6210 return SDValue();
6211
6212 unsigned ExtType;
6213 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
6214 ExtType = ISD::SIGN_EXTEND;
6215 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
6216 LoadNode->getExtensionType() == ISD::EXTLOAD)
6217 ExtType = ISD::ZERO_EXTEND;
6218 else
6219 return SDValue();
6220
6221 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
6222 LoadNode->getBasePtr(), MachinePointerInfo());
6223 SDValue Chain = Load.getValue(1);
6224 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
6225 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
6226 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
6227 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
6228 DAG.getConstant(0, DL, MVT::i64));
6229 if (VT == MVT::v4i32)
6230 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
6231 return DAG.getMergeValues({Ext, Chain}, DL);
6232}
6233
6234// Generate SUBS and CSEL for integer abs.
6235SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
6236 MVT VT = Op.getSimpleValueType();
6237
6238 if (VT.isVector())
6239 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
6240
6241 SDLoc DL(Op);
6242 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
6243 Op.getOperand(0));
6244 // Generate SUBS & CSEL.
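  // As a rough illustration, for a scalar i32 this builds
  //   %neg = sub i32 0, %x;  SUBS(%x, 0);  CSEL(%x, %neg, pl)
  // which instruction selection typically folds to "cmp w0, #0; cneg w0, w0, mi".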
6245 SDValue Cmp =
6246 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
6247 Op.getOperand(0), DAG.getConstant(0, DL, VT));
6248 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
6249 DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
6250 Cmp.getValue(1));
6251}
6252
6253static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
6254 SDValue Chain = Op.getOperand(0);
6255 SDValue Cond = Op.getOperand(1);
6256 SDValue Dest = Op.getOperand(2);
6257
6258 AArch64CC::CondCode CC;
6259 if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
6260 SDLoc dl(Op);
6261 SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32);
6262 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
6263 Cmp);
6264 }
6265
6266 return SDValue();
6267}
6268
6269// Treat FSHR with constant shifts as a legal operation; otherwise it is
6270// expanded. FSHL is converted to FSHR before deciding what to do with it.
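// As a rough illustration, an i64 FSHL by a constant 8 is rewritten as an FSHR
// by 64 - 8 = 56, which can then be matched directly by the EXTR instruction.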
6271static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) {
6272 SDValue Shifts = Op.getOperand(2);
6273 // Check if the shift amount is a constant
6274 // If opcode is FSHL, convert it to FSHR
6275 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
6276 SDLoc DL(Op);
6277 MVT VT = Op.getSimpleValueType();
6278
6279 if (Op.getOpcode() == ISD::FSHL) {
6280 unsigned int NewShiftNo =
6281 VT.getFixedSizeInBits() - ShiftNo->getZExtValue();
6282 return DAG.getNode(
6283 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
6284 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
6285 } else if (Op.getOpcode() == ISD::FSHR) {
6286 return Op;
6287 }
6288 }
6289
6290 return SDValue();
6291}
6292
6293static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) {
6294 SDValue X = Op.getOperand(0);
6295 EVT XScalarTy = X.getValueType();
6296 SDValue Exp = Op.getOperand(1);
6297
6298 SDLoc DL(Op);
6299 EVT XVT, ExpVT;
6300 switch (Op.getSimpleValueType().SimpleTy) {
6301 default:
6302 return SDValue();
6303 case MVT::bf16:
6304 case MVT::f16:
6305 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
6306 [[fallthrough]];
6307 case MVT::f32:
6308 XVT = MVT::nxv4f32;
6309 ExpVT = MVT::nxv4i32;
6310 break;
6311 case MVT::f64:
6312 XVT = MVT::nxv2f64;
6313 ExpVT = MVT::nxv2i64;
6314 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
6315 break;
6316 }
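  // As a rough sketch, the scalar inputs are inserted into lane 0 of undef
  // scalable vectors, scaled with the SVE FSCALE intrinsic under an all-true
  // predicate (roughly "ptrue p0.s; fscale z0.s, p0/m, z0.s, z1.s" for f32),
  // and the result is read back out of lane 0.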
6317
6318 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
6319 SDValue VX =
6320 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
6321 SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
6322 DAG.getUNDEF(ExpVT), Exp, Zero);
6323 SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1),
6324 AArch64SVEPredPattern::all);
6325 SDValue FScale =
6326 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, XVT,
6327 DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64),
6328 VPg, VX, VExp);
6329 SDValue Final =
6330 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
6331 if (X.getValueType() != XScalarTy)
6332 Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
6333 DAG.getIntPtrConstant(1, SDLoc(Op)));
6334 return Final;
6335}
6336
6337SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
6338 SelectionDAG &DAG) const {
6339 LLVM_DEBUG(dbgs() << "Custom lowering: ");
6340 LLVM_DEBUG(Op.dump());
6341
6342 switch (Op.getOpcode()) {
6343 default:
6344 llvm_unreachable("unimplemented operand");
6345 return SDValue();
6346 case ISD::BITCAST:
6347 return LowerBITCAST(Op, DAG);
6348 case ISD::GlobalAddress:
6349 return LowerGlobalAddress(Op, DAG);
6350 case ISD::GlobalTLSAddress:
6351 return LowerGlobalTLSAddress(Op, DAG);
6352 case ISD::SETCC:
6353 case ISD::STRICT_FSETCC:
6354 case ISD::STRICT_FSETCCS:
6355 return LowerSETCC(Op, DAG);
6356 case ISD::SETCCCARRY:
6357 return LowerSETCCCARRY(Op, DAG);
6358 case ISD::BRCOND:
6359 return LowerBRCOND(Op, DAG);
6360 case ISD::BR_CC:
6361 return LowerBR_CC(Op, DAG);
6362 case ISD::SELECT:
6363 return LowerSELECT(Op, DAG);
6364 case ISD::SELECT_CC:
6365 return LowerSELECT_CC(Op, DAG);
6366 case ISD::JumpTable:
6367 return LowerJumpTable(Op, DAG);
6368 case ISD::BR_JT:
6369 return LowerBR_JT(Op, DAG);
6370 case ISD::ConstantPool:
6371 return LowerConstantPool(Op, DAG);
6372 case ISD::BlockAddress:
6373 return LowerBlockAddress(Op, DAG);
6374 case ISD::VASTART:
6375 return LowerVASTART(Op, DAG);
6376 case ISD::VACOPY:
6377 return LowerVACOPY(Op, DAG);
6378 case ISD::VAARG:
6379 return LowerVAARG(Op, DAG);
6380 case ISD::UADDO_CARRY:
6381 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
6382 case ISD::USUBO_CARRY:
6383 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
6384 case ISD::SADDO_CARRY:
6385 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
6386 case ISD::SSUBO_CARRY:
6387 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
6388 case ISD::SADDO:
6389 case ISD::UADDO:
6390 case ISD::SSUBO:
6391 case ISD::USUBO:
6392 case ISD::SMULO:
6393 case ISD::UMULO:
6394 return LowerXALUO(Op, DAG);
6395 case ISD::FADD:
6396 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
6397 case ISD::FSUB:
6398 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
6399 case ISD::FMUL:
6400 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
6401 case ISD::FMA:
6402 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
6403 case ISD::FDIV:
6404 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
6405 case ISD::FNEG:
6406 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
6407 case ISD::FCEIL:
6408 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
6409 case ISD::FFLOOR:
6410 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
6411 case ISD::FNEARBYINT:
6412 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
6413 case ISD::FRINT:
6414 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
6415 case ISD::FROUND:
6416 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
6417 case ISD::FROUNDEVEN:
6418 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
6419 case ISD::FTRUNC:
6420 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
6421 case ISD::FSQRT:
6422 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
6423 case ISD::FABS:
6424 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
6425 case ISD::FP_ROUND:
6426 case ISD::STRICT_FP_ROUND:
6427 return LowerFP_ROUND(Op, DAG);
6428 case ISD::FP_EXTEND:
6429 return LowerFP_EXTEND(Op, DAG);
6430 case ISD::FRAMEADDR:
6431 return LowerFRAMEADDR(Op, DAG);
6432 case ISD::SPONENTRY:
6433 return LowerSPONENTRY(Op, DAG);
6434 case ISD::RETURNADDR:
6435 return LowerRETURNADDR(Op, DAG);
6436 case ISD::ADDROFRETURNADDR:
6437 return LowerADDROFRETURNADDR(Op, DAG);
6438 case ISD::CONCAT_VECTORS:
6439 return LowerCONCAT_VECTORS(Op, DAG);
6440 case ISD::INSERT_VECTOR_ELT:
6441 return LowerINSERT_VECTOR_ELT(Op, DAG);
6442 case ISD::EXTRACT_VECTOR_ELT:
6443 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
6444 case ISD::BUILD_VECTOR:
6445 return LowerBUILD_VECTOR(Op, DAG);
6446 case ISD::ZERO_EXTEND_VECTOR_INREG:
6447 return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
6448 case ISD::VECTOR_SHUFFLE:
6449 return LowerVECTOR_SHUFFLE(Op, DAG);
6450 case ISD::SPLAT_VECTOR:
6451 return LowerSPLAT_VECTOR(Op, DAG);
6452 case ISD::EXTRACT_SUBVECTOR:
6453 return LowerEXTRACT_SUBVECTOR(Op, DAG);
6454 case ISD::INSERT_SUBVECTOR:
6455 return LowerINSERT_SUBVECTOR(Op, DAG);
6456 case ISD::SDIV:
6457 case ISD::UDIV:
6458 return LowerDIV(Op, DAG);
6459 case ISD::SMIN:
6460 case ISD::UMIN:
6461 case ISD::SMAX:
6462 case ISD::UMAX:
6463 return LowerMinMax(Op, DAG);
6464 case ISD::SRA:
6465 case ISD::SRL:
6466 case ISD::SHL:
6467 return LowerVectorSRA_SRL_SHL(Op, DAG);
6468 case ISD::SHL_PARTS:
6469 case ISD::SRL_PARTS:
6470 case ISD::SRA_PARTS:
6471 return LowerShiftParts(Op, DAG);
6472 case ISD::CTPOP:
6473 case ISD::PARITY:
6474 return LowerCTPOP_PARITY(Op, DAG);
6475 case ISD::FCOPYSIGN:
6476 return LowerFCOPYSIGN(Op, DAG);
6477 case ISD::OR:
6478 return LowerVectorOR(Op, DAG);
6479 case ISD::XOR:
6480 return LowerXOR(Op, DAG);
6481 case ISD::PREFETCH:
6482 return LowerPREFETCH(Op, DAG);
6483 case ISD::SINT_TO_FP:
6484 case ISD::UINT_TO_FP:
6485 case ISD::STRICT_SINT_TO_FP:
6486 case ISD::STRICT_UINT_TO_FP:
6487 return LowerINT_TO_FP(Op, DAG);
6488 case ISD::FP_TO_SINT:
6489 case ISD::FP_TO_UINT:
6490 case ISD::STRICT_FP_TO_SINT:
6491 case ISD::STRICT_FP_TO_UINT:
6492 return LowerFP_TO_INT(Op, DAG);
6493 case ISD::FP_TO_SINT_SAT:
6494 case ISD::FP_TO_UINT_SAT:
6495 return LowerFP_TO_INT_SAT(Op, DAG);
6496 case ISD::FSINCOS:
6497 return LowerFSINCOS(Op, DAG);
6498 case ISD::GET_ROUNDING:
6499 return LowerGET_ROUNDING(Op, DAG);
6500 case ISD::SET_ROUNDING:
6501 return LowerSET_ROUNDING(Op, DAG);
6502 case ISD::MUL:
6503 return LowerMUL(Op, DAG);
6504 case ISD::MULHS:
6505 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
6506 case ISD::MULHU:
6507 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
6508 case ISD::INTRINSIC_W_CHAIN:
6509 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6510 case ISD::INTRINSIC_WO_CHAIN:
6511 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6512 case ISD::INTRINSIC_VOID:
6513 return LowerINTRINSIC_VOID(Op, DAG);
6514 case ISD::ATOMIC_STORE:
6515 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
6516 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
6517 return LowerStore128(Op, DAG);
6518 }
6519 return SDValue();
6520 case ISD::STORE:
6521 return LowerSTORE(Op, DAG);
6522 case ISD::MSTORE:
6523 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
6524 case ISD::MGATHER:
6525 return LowerMGATHER(Op, DAG);
6526 case ISD::MSCATTER:
6527 return LowerMSCATTER(Op, DAG);
6528 case ISD::VECREDUCE_SEQ_FADD:
6529 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
6530 case ISD::VECREDUCE_ADD:
6531 case ISD::VECREDUCE_AND:
6532 case ISD::VECREDUCE_OR:
6533 case ISD::VECREDUCE_XOR:
6534 case ISD::VECREDUCE_SMAX:
6535 case ISD::VECREDUCE_SMIN:
6536 case ISD::VECREDUCE_UMAX:
6537 case ISD::VECREDUCE_UMIN:
6538 case ISD::VECREDUCE_FADD:
6539 case ISD::VECREDUCE_FMAX:
6540 case ISD::VECREDUCE_FMIN:
6541 case ISD::VECREDUCE_FMAXIMUM:
6542 case ISD::VECREDUCE_FMINIMUM:
6543 return LowerVECREDUCE(Op, DAG);
6544 case ISD::ATOMIC_LOAD_AND:
6545 return LowerATOMIC_LOAD_AND(Op, DAG);
6546 case ISD::DYNAMIC_STACKALLOC:
6547 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6548 case ISD::VSCALE:
6549 return LowerVSCALE(Op, DAG);
6550 case ISD::ANY_EXTEND:
6551 case ISD::SIGN_EXTEND:
6552 case ISD::ZERO_EXTEND:
6553 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
6554 case ISD::SIGN_EXTEND_INREG: {
6555 // Only custom lower when ExtraVT has a legal byte based element type.
6556 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
6557 EVT ExtraEltVT = ExtraVT.getVectorElementType();
6558 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
6559 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
6560 return SDValue();
6561
6562 return LowerToPredicatedOp(Op, DAG,
6563 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
6564 }
6565 case ISD::TRUNCATE:
6566 return LowerTRUNCATE(Op, DAG);
6567 case ISD::MLOAD:
6568 return LowerMLOAD(Op, DAG);
6569 case ISD::LOAD:
6570 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
6571 !Subtarget->isNeonAvailable()))
6572 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
6573 return LowerLOAD(Op, DAG);
6574 case ISD::ADD:
6575 case ISD::AND:
6576 case ISD::SUB:
6577 return LowerToScalableOp(Op, DAG);
6578 case ISD::FMAXIMUM:
6579 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
6580 case ISD::FMAXNUM:
6581 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
6582 case ISD::FMINIMUM:
6583 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
6584 case ISD::FMINNUM:
6585 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
6586 case ISD::VSELECT:
6587 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
6588 case ISD::ABS:
6589 return LowerABS(Op, DAG);
6590 case ISD::ABDS:
6591 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
6592 case ISD::ABDU:
6593 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
6594 case ISD::AVGFLOORS:
6595 return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
6596 case ISD::AVGFLOORU:
6597 return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
6598 case ISD::AVGCEILS:
6599 return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
6600 case ISD::AVGCEILU:
6601 return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
6602 case ISD::BITREVERSE:
6603 return LowerBitreverse(Op, DAG);
6604 case ISD::BSWAP:
6605 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
6606 case ISD::CTLZ:
6607 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
6608 case ISD::CTTZ:
6609 return LowerCTTZ(Op, DAG);
6610 case ISD::VECTOR_SPLICE:
6611 return LowerVECTOR_SPLICE(Op, DAG);
6612 case ISD::VECTOR_DEINTERLEAVE:
6613 return LowerVECTOR_DEINTERLEAVE(Op, DAG);
6614 case ISD::VECTOR_INTERLEAVE:
6615 return LowerVECTOR_INTERLEAVE(Op, DAG);
6616 case ISD::LROUND:
6617 case ISD::LLROUND:
6618 case ISD::LRINT:
6619 case ISD::LLRINT: {
6620 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
6621 Op.getOperand(0).getValueType() == MVT::bf16) &&
6622 "Expected custom lowering of rounding operations only for f16");
6623 SDLoc DL(Op);
6624 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
6625 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
6626 }
6627 case ISD::STRICT_LROUND:
6628 case ISD::STRICT_LLROUND:
6629 case ISD::STRICT_LRINT:
6630 case ISD::STRICT_LLRINT: {
6631 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
6632 Op.getOperand(1).getValueType() == MVT::bf16) &&
6633 "Expected custom lowering of rounding operations only for f16");
6634 SDLoc DL(Op);
6635 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
6636 {Op.getOperand(0), Op.getOperand(1)});
6637 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
6638 {Ext.getValue(1), Ext.getValue(0)});
6639 }
6640 case ISD::WRITE_REGISTER: {
6641 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
6642 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
6643 SDLoc DL(Op);
6644
6645 SDValue Chain = Op.getOperand(0);
6646 SDValue SysRegName = Op.getOperand(1);
6647 std::pair<SDValue, SDValue> Pair =
6648 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
6649
6650 // chain = MSRR(chain, sysregname, lo, hi)
6651 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
6652 SysRegName, Pair.first, Pair.second);
6653
6654 return Result;
6655 }
6656 case ISD::FSHL:
6657 case ISD::FSHR:
6658 return LowerFunnelShift(Op, DAG);
6659 case ISD::FLDEXP:
6660 return LowerFLDEXP(Op, DAG);
6661 }
6662}
6663
6664bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
6665 return !Subtarget->useSVEForFixedLengthVectors();
6666}
6667
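// For example, on a subtarget whose minimum SVE vector length is 256 bits, a
// 256-bit fixed vector such as v8i32 is lowered via SVE containers, whereas
// NEON-sized types (64/128 bit) only take this path when OverrideNEON is set
// and SVE or SME is available.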
6668bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
6669 EVT VT, bool OverrideNEON) const {
6670 if (!VT.isFixedLengthVector() || !VT.isSimple())
6671 return false;
6672
6673 // Don't use SVE for vectors we cannot scalarize if required.
6674 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
6675 // Fixed length predicates should be promoted to i8.
6676 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
6677 case MVT::i1:
6678 default:
6679 return false;
6680 case MVT::i8:
6681 case MVT::i16:
6682 case MVT::i32:
6683 case MVT::i64:
6684 case MVT::f16:
6685 case MVT::f32:
6686 case MVT::f64:
6687 break;
6688 }
6689
6690 // NEON-sized vectors can be emulated using SVE instructions.
6691 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
6692 return Subtarget->hasSVEorSME();
6693
6694 // Ensure NEON MVTs only belong to a single register class.
6695 if (VT.getFixedSizeInBits() <= 128)
6696 return false;
6697
6698 // Ensure wider than NEON code generation is enabled.
6699 if (!Subtarget->useSVEForFixedLengthVectors())
6700 return false;
6701
6702 // Don't use SVE for types that don't fit.
6703 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
6704 return false;
6705
6706 // TODO: Perhaps an artificial restriction, but worth having whilst getting
6707 // the base fixed length SVE support in place.
6708 if (!VT.isPow2VectorType())
6709 return false;
6710
6711 return true;
6712}
6713
6714//===----------------------------------------------------------------------===//
6715// Calling Convention Implementation
6716//===----------------------------------------------------------------------===//
6717
6718static unsigned getIntrinsicID(const SDNode *N) {
6719 unsigned Opcode = N->getOpcode();
6720 switch (Opcode) {
6721 default:
6722 return Intrinsic::not_intrinsic;
6723 case ISD::INTRINSIC_WO_CHAIN: {
6724 unsigned IID = N->getConstantOperandVal(0);
6725 if (IID < Intrinsic::num_intrinsics)
6726 return IID;
6727 return Intrinsic::not_intrinsic;
6728 }
6729 }
6730}
6731
6732bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
6733 SDValue N1) const {
6734 if (!N0.hasOneUse())
6735 return false;
6736
6737 unsigned IID = getIntrinsicID(N1.getNode());
6738 // Avoid reassociating expressions that can be lowered to smlal/umlal.
6739 if (IID == Intrinsic::aarch64_neon_umull ||
6740 N1.getOpcode() == AArch64ISD::UMULL ||
6741 IID == Intrinsic::aarch64_neon_smull ||
6742 N1.getOpcode() == AArch64ISD::SMULL)
6743 return N0.getOpcode() != ISD::ADD;
6744
6745 return true;
6746}
6747
6748/// Selects the correct CCAssignFn for a given CallingConvention value.
6749CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
6750 bool IsVarArg) const {
6751 switch (CC) {
6752 default:
6753 report_fatal_error("Unsupported calling convention.");
6754 case CallingConv::GHC:
6755 return CC_AArch64_GHC;
6756 case CallingConv::C:
6757 case CallingConv::Fast:
6758 case CallingConv::PreserveMost:
6759 case CallingConv::PreserveAll:
6760 case CallingConv::CXX_FAST_TLS:
6761 case CallingConv::Swift:
6762 case CallingConv::SwiftTail:
6763 case CallingConv::Tail:
6764 case CallingConv::GRAAL:
6765 if (Subtarget->isTargetWindows()) {
6766 if (IsVarArg) {
6767 if (Subtarget->isWindowsArm64EC())
6768 return CC_AArch64_Arm64EC_VarArg;
6769 return CC_AArch64_Win64_VarArg;
6770 }
6771 return CC_AArch64_Win64PCS;
6772 }
6773 if (!Subtarget->isTargetDarwin())
6774 return CC_AArch64_AAPCS;
6775 if (!IsVarArg)
6776 return CC_AArch64_DarwinPCS;
6777 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
6778 : CC_AArch64_DarwinPCS_VarArg;
6779 case CallingConv::Win64:
6780 if (IsVarArg) {
6781 if (Subtarget->isWindowsArm64EC())
6782 return CC_AArch64_Arm64EC_VarArg;
6783 return CC_AArch64_Win64_VarArg;
6784 }
6785 return CC_AArch64_Win64PCS;
6787 if (Subtarget->isWindowsArm64EC())
6794 return CC_AArch64_AAPCS;
6799 }
6800}
6801
6802CCAssignFn *
6803AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
6804 switch (CC) {
6805 default:
6806 return RetCC_AArch64_AAPCS;
6810 if (Subtarget->isWindowsArm64EC())
6812 return RetCC_AArch64_AAPCS;
6813 }
6814}
6815
6816
6817unsigned
6818AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL,
6819 SelectionDAG &DAG) const {
6821 MachineFrameInfo &MFI = MF.getFrameInfo();
6822
6823 // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case)
6824 SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
6825 DAG.getConstant(1, DL, MVT::i32));
6826 SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N);
6827 SDValue Ops[] = {Chain, NN, DAG.getConstant(1, DL, MVT::i64)};
6828 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other);
6829 SDValue Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, VTs, Ops);
6830 Chain = Buffer.getValue(1);
6831 MFI.CreateVariableSizedObject(Align(1), nullptr);
6832
6833 // Allocate an additional TPIDR2 object on the stack (16 bytes)
6834 unsigned TPIDR2Obj = MFI.CreateStackObject(16, Align(16), false);
6835
6836 // Store the buffer pointer to the TPIDR2 stack object.
6839 TPIDR2Obj,
6841 Chain = DAG.getStore(Chain, DL, Buffer, Ptr, MPI);
6842
6843 // Set the reserved bytes (10-15) to zero
6844 EVT PtrTy = Ptr.getValueType();
6845 SDValue ReservedPtr =
6846 DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(10, DL, PtrTy));
6847 Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i16), ReservedPtr,
6848 MPI);
6849 ReservedPtr =
6850 DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(12, DL, PtrTy));
6851 Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i32), ReservedPtr,
6852 MPI);
6853
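  // At this point the 16-byte TPIDR2 block holds the lazy-save buffer pointer
  // in bytes 0-7 and zeroes in the reserved bytes 10-15; bytes 8-9 (the number
  // of ZA slices to save) are expected to be filled in later, around calls
  // that require the lazy save.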
6854 return TPIDR2Obj;
6855}
6856
6857static bool isPassedInFPR(EVT VT) {
6858 return VT.isFixedLengthVector() ||
6859 (VT.isFloatingPoint() && !VT.isScalableVector());
6860}
6861
6862SDValue AArch64TargetLowering::LowerFormalArguments(
6863 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
6864 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
6865 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
6866 MachineFunction &MF = DAG.getMachineFunction();
6867 const Function &F = MF.getFunction();
6868 MachineFrameInfo &MFI = MF.getFrameInfo();
6869 bool IsWin64 = Subtarget->isCallingConvWin64(F.getCallingConv());
6870 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
6871 (isVarArg && Subtarget->isWindowsArm64EC());
6872 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
6873
6874 SmallVector<ISD::OutputArg, 4> Outs;
6875 GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
6876 DAG.getTargetLoweringInfo());
6877 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
6878 FuncInfo->setIsSVECC(true);
6879
6880 // Assign locations to all of the incoming arguments.
6881 SmallVector<CCValAssign, 16> ArgLocs;
6882 DenseMap<unsigned, SDValue> CopiedRegs;
6883 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
6884
6885 // At this point, Ins[].VT may already be promoted to i32. To correctly
6886 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
6887 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
6888 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
6889 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
6890 // LocVT.
6891 unsigned NumArgs = Ins.size();
6892 Function::const_arg_iterator CurOrigArg = F.arg_begin();
6893 unsigned CurArgIdx = 0;
6894 for (unsigned i = 0; i != NumArgs; ++i) {
6895 MVT ValVT = Ins[i].VT;
6896 if (Ins[i].isOrigArg()) {
6897 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
6898 CurArgIdx = Ins[i].getOrigArgIndex();
6899
6900 // Get type of the original argument.
6901 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
6902 /*AllowUnknown*/ true);
6903 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
6904 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
6905 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
6906 ValVT = MVT::i8;
6907 else if (ActualMVT == MVT::i16)
6908 ValVT = MVT::i16;
6909 }
6910 bool UseVarArgCC = false;
6911 if (IsWin64)
6912 UseVarArgCC = isVarArg;
6913 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
6914 bool Res =
6915 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
6916 assert(!Res && "Call operand has unhandled type");
6917 (void)Res;
6918 }
6919
6920 SMEAttrs Attrs(MF.getFunction());
6921 bool IsLocallyStreaming =
6922 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
6923 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
6924 SDValue Glue = Chain.getValue(1);
6925
6926 SmallVector<SDValue, 16> ArgValues;
6927 unsigned ExtraArgLocs = 0;
6928 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
6929 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
6930
6931 if (Ins[i].Flags.isByVal()) {
6932 // Byval is used for HFAs in the PCS, but the system should work in a
6933 // non-compliant manner for larger structs.
6934 EVT PtrVT = getPointerTy(DAG.getDataLayout());
6935 int Size = Ins[i].Flags.getByValSize();
6936 unsigned NumRegs = (Size + 7) / 8;
6937
6938 // FIXME: This works on big-endian for composite byvals, which are the common
6939 // case. It should also work for fundamental types.
6940 unsigned FrameIdx =
6941 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
6942 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
6943 InVals.push_back(FrameIdxN);
6944
6945 continue;
6946 }
6947
6948 if (Ins[i].Flags.isSwiftAsync())
6949 MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
6950
6951 SDValue ArgValue;
6952 if (VA.isRegLoc()) {
6953 // Arguments stored in registers.
6954 EVT RegVT = VA.getLocVT();
6955 const TargetRegisterClass *RC;
6956
6957 if (RegVT == MVT::i32)
6958 RC = &AArch64::GPR32RegClass;
6959 else if (RegVT == MVT::i64)
6960 RC = &AArch64::GPR64RegClass;
6961 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
6962 RC = &AArch64::FPR16RegClass;
6963 else if (RegVT == MVT::f32)
6964 RC = &AArch64::FPR32RegClass;
6965 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
6966 RC = &AArch64::FPR64RegClass;
6967 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
6968 RC = &AArch64::FPR128RegClass;
6969 else if (RegVT.isScalableVector() &&
6970 RegVT.getVectorElementType() == MVT::i1) {
6971 FuncInfo->setIsSVECC(true);
6972 RC = &AArch64::PPRRegClass;
6973 } else if (RegVT == MVT::aarch64svcount) {
6974 FuncInfo->setIsSVECC(true);
6975 RC = &AArch64::PPRRegClass;
6976 } else if (RegVT.isScalableVector()) {
6977 FuncInfo->setIsSVECC(true);
6978 RC = &AArch64::ZPRRegClass;
6979 } else
6980 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
6981
6982 // Transform the arguments in physical registers into virtual ones.
6983 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
6984
6985 if (IsLocallyStreaming) {
6986 // LocallyStreamingFunctions must insert the SMSTART in the correct
6987 // position, so we use Glue to ensure no instructions can be scheduled
6988 // between the chain of:
6989 // t0: ch,glue = EntryNode
6990 // t1: res,ch,glue = CopyFromReg
6991 // ...
6992 // tn: res,ch,glue = CopyFromReg t(n-1), ..
6993 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
6994 // ^^^^^^
6995 // This will be the new Chain/Root node.
6996 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
6997 Glue = ArgValue.getValue(2);
6998 if (isPassedInFPR(ArgValue.getValueType())) {
6999 ArgValue =
7000 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
7001 DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
7002 {ArgValue, Glue});
7003 Glue = ArgValue.getValue(1);
7004 }
7005 } else
7006 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
7007
7008 // If this is an 8, 16 or 32-bit value, it is really passed promoted
7009 // to 64 bits. Insert an assert[sz]ext to capture this, then
7010 // truncate to the right size.
7011 switch (VA.getLocInfo()) {
7012 default:
7013 llvm_unreachable("Unknown loc info!");
7014 case CCValAssign::Full:
7015 break;
7016 case CCValAssign::Indirect:
7017 assert(
7018 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
7019 "Indirect arguments should be scalable on most subtargets");
7020 break;
7021 case CCValAssign::BCvt:
7022 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
7023 break;
7024 case CCValAssign::AExt:
7025 case CCValAssign::SExt:
7026 case CCValAssign::ZExt:
7027 break;
7028 case CCValAssign::AExtUpper:
7029 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
7030 DAG.getConstant(32, DL, RegVT));
7031 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
7032 break;
7033 }
7034 } else { // VA.isRegLoc()
7035 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
7036 unsigned ArgOffset = VA.getLocMemOffset();
7037 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
7038 ? VA.getLocVT().getSizeInBits()
7039 : VA.getValVT().getSizeInBits()) / 8;
7040
7041 uint32_t BEAlign = 0;
7042 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
7043 !Ins[i].Flags.isInConsecutiveRegs())
7044 BEAlign = 8 - ArgSize;
7045
7046 SDValue FIN;
7047 MachinePointerInfo PtrInfo;
7048 if (StackViaX4) {
7049 // In both the ARM64EC varargs convention and the thunk convention,
7050 // arguments on the stack are accessed relative to x4, not sp. In
7051 // the thunk convention, there's an additional offset of 32 bytes
7052 // to account for the shadow store.
7053 unsigned ObjOffset = ArgOffset + BEAlign;
7054 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
7055 ObjOffset += 32;
7056 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
7057 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7058 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
7059 DAG.getConstant(ObjOffset, DL, MVT::i64));
7061 } else {
7062 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
7063
7064 // Create load nodes to retrieve arguments from the stack.
7065 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
7066 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
7067 }
7068
7069 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
7070 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
7071 MVT MemVT = VA.getValVT();
7072
7073 switch (VA.getLocInfo()) {
7074 default:
7075 break;
7076 case CCValAssign::Trunc:
7077 case CCValAssign::BCvt:
7078 MemVT = VA.getLocVT();
7079 break;
7080 case CCValAssign::Indirect:
7081 assert((VA.getValVT().isScalableVT() ||
7082 Subtarget->isWindowsArm64EC()) &&
7083 "Indirect arguments should be scalable on most subtargets");
7084 MemVT = VA.getLocVT();
7085 break;
7086 case CCValAssign::SExt:
7087 ExtType = ISD::SEXTLOAD;
7088 break;
7089 case CCValAssign::ZExt:
7090 ExtType = ISD::ZEXTLOAD;
7091 break;
7092 case CCValAssign::AExt:
7093 ExtType = ISD::EXTLOAD;
7094 break;
7095 }
7096
7097 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
7098 MemVT);
7099 }
7100
7101 if (VA.getLocInfo() == CCValAssign::Indirect) {
7102 assert((VA.getValVT().isScalableVT() ||
7103 Subtarget->isWindowsArm64EC()) &&
7104 "Indirect arguments should be scalable on most subtargets");
7105
7106 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
7107 unsigned NumParts = 1;
7108 if (Ins[i].Flags.isInConsecutiveRegs()) {
7109 assert(!Ins[i].Flags.isInConsecutiveRegsLast());
7110 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
7111 ++NumParts;
7112 }
7113
7114 MVT PartLoad = VA.getValVT();
7115 SDValue Ptr = ArgValue;
7116
7117 // Ensure we generate all loads for each tuple part, whilst updating the
7118 // pointer after each load correctly using vscale.
7119 while (NumParts > 0) {
7120 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
7121 InVals.push_back(ArgValue);
7122 NumParts--;
7123 if (NumParts > 0) {
7124 SDValue BytesIncrement;
7125 if (PartLoad.isScalableVector()) {
7126 BytesIncrement = DAG.getVScale(
7127 DL, Ptr.getValueType(),
7128 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
7129 } else {
7130 BytesIncrement = DAG.getConstant(
7131 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
7132 Ptr.getValueType());
7133 }
7133 }
7134 SDNodeFlags Flags;
7135 Flags.setNoUnsignedWrap(true);
7136 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7137 BytesIncrement, Flags);
7138 ExtraArgLocs++;
7139 i++;
7140 }
7141 }
7142 } else {
7143 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
7144 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
7145 ArgValue, DAG.getValueType(MVT::i32));
7146
7147 // i1 arguments are zero-extended to i8 by the caller. Emit a
7148 // hint to reflect this.
7149 if (Ins[i].isOrigArg()) {
7150 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
7151 if (OrigArg->getType()->isIntegerTy(1)) {
7152 if (!Ins[i].Flags.isZExt()) {
7153 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
7154 ArgValue.getValueType(), ArgValue);
7155 }
7156 }
7157 }
7158
7159 InVals.push_back(ArgValue);
7160 }
7161 }
7162 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
7163
7164 // Insert the SMSTART if this is a locally streaming function and
7165 // make sure it is Glued to the last CopyFromReg value.
7166 if (IsLocallyStreaming) {
7167 SDValue PStateSM;
7168 if (Attrs.hasStreamingCompatibleInterface()) {
7169 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
7172 FuncInfo->setPStateSMReg(Reg);
7173 Chain = DAG.getCopyToReg(Chain, DL, Reg, PStateSM);
7174 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
7176 } else
7177 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
7179
7180 // Ensure that the SMSTART happens after the CopyWithChain such that its
7181 // chain result is used.
7182 for (unsigned I=0; I<InVals.size(); ++I) {
7183 Register Reg = MF.getRegInfo().createVirtualRegister(
7184 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
7185 Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
7186 InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
7187 InVals[I].getValueType());
7188 }
7189 }
7190
7191 // varargs
7192 if (isVarArg) {
7193 if (!Subtarget->isTargetDarwin() || IsWin64) {
7194 // The AAPCS variadic function ABI is identical to the non-variadic
7195 // one. As a result there may be more arguments in registers and we should
7196 // save them for future reference.
7197 // Win64 variadic functions also pass arguments in registers, but all float
7198 // arguments are passed in integer registers.
7199 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
7200 }
7201
7202 // This will point to the next argument passed via stack.
7203 unsigned VarArgsOffset = CCInfo.getStackSize();
7204 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
7205 VarArgsOffset = alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
7206 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
7207 FuncInfo->setVarArgsStackIndex(
7208 MFI.CreateFixedObject(4, VarArgsOffset, true));
7209
7210 if (MFI.hasMustTailInVarArgFunc()) {
7211 SmallVector<MVT, 2> RegParmTypes;
7212 RegParmTypes.push_back(MVT::i64);
7213 RegParmTypes.push_back(MVT::f128);
7214 // Compute the set of forwarded registers. The rest are scratch.
7216 FuncInfo->getForwardedMustTailRegParms();
7217 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
7218 CC_AArch64_AAPCS);
7219
7220 // Conservatively forward X8, since it might be used for aggregate return.
7221 if (!CCInfo.isAllocated(AArch64::X8)) {
7222 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
7223 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
7224 }
7225 }
7226 }
7227
7228 // On Windows, InReg pointers must be returned, so record the pointer in a
7229 // virtual register at the start of the function so it can be returned in the
7230 // epilogue.
7231 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
7232 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
7233 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
7234 Ins[I].Flags.isInReg()) &&
7235 Ins[I].Flags.isSRet()) {
7236 assert(!FuncInfo->getSRetReturnReg());
7237
7238 MVT PtrTy = getPointerTy(DAG.getDataLayout());
7239 Register Reg =
7240 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
7241 FuncInfo->setSRetReturnReg(Reg);
7242
7243 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
7244 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
7245 break;
7246 }
7247 }
7248 }
7249
7250 unsigned StackArgSize = CCInfo.getStackSize();
7251 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
7252 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
7253 // This is a non-standard ABI so by fiat I say we're allowed to make full
7254 // use of the stack area to be popped, which must be aligned to 16 bytes in
7255 // any case:
7256 StackArgSize = alignTo(StackArgSize, 16);
7257
7258 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
7259 // a multiple of 16.
7260 FuncInfo->setArgumentStackToRestore(StackArgSize);
7261
7262 // This realignment carries over to the available bytes below. Our own
7263 // callers will guarantee the space is free by giving an aligned value to
7264 // CALLSEQ_START.
7265 }
7266 // Even if we're not expected to free up the space, it's useful to know how
7267 // much is there while considering tail calls (because we can reuse it).
7268 FuncInfo->setBytesInStackArgArea(StackArgSize);
7269
7270 if (Subtarget->hasCustomCallingConv())
7271 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
7272
7273 // Conservatively assume the function requires the lazy-save mechanism.
7274 if (SMEAttrs(MF.getFunction()).hasZAState()) {
7275 unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG);
7276 FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj);
7277 }
7278
7279 return Chain;
7280}
7281
7282void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
7283 SelectionDAG &DAG,
7284 const SDLoc &DL,
7285 SDValue &Chain) const {
7286 MachineFunction &MF = DAG.getMachineFunction();
7287 MachineFrameInfo &MFI = MF.getFrameInfo();
7288 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7289 auto PtrVT = getPointerTy(DAG.getDataLayout());
7290 bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
7291
7292 SmallVector<SDValue, 8> MemOps;
7293
7294 auto GPRArgRegs = AArch64::getGPRArgRegs();
7295 unsigned NumGPRArgRegs = GPRArgRegs.size();
7296 if (Subtarget->isWindowsArm64EC()) {
7297 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
7298 // functions.
7299 NumGPRArgRegs = 4;
7300 }
7301 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
7302
7303 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
7304 int GPRIdx = 0;
7305 if (GPRSaveSize != 0) {
7306 if (IsWin64) {
7307 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
7308 if (GPRSaveSize & 15)
7309 // The extra size here, if triggered, will always be 8.
7310 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
7311 } else
7312 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
7313
7314 SDValue FIN;
7315 if (Subtarget->isWindowsArm64EC()) {
7316 // With the Arm64EC ABI, we reserve the save area as usual, but we
7317 // compute its address relative to x4. For a normal AArch64->AArch64
7318 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
7319 // different address.
7320 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
7321 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7322 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
7323 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
7324 } else {
7325 FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
7326 }
7327
7328 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
7329 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
7330 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7331 SDValue Store =
7332 DAG.getStore(Val.getValue(1), DL, Val, FIN,
7333 IsWin64 ? MachinePointerInfo::getFixedStack(
7334 MF, GPRIdx, (i - FirstVariadicGPR) * 8)
7335 : MachinePointerInfo::getStack(MF, i * 8));
7336 MemOps.push_back(Store);
7337 FIN =
7338 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
7339 }
7340 }
7341 FuncInfo->setVarArgsGPRIndex(GPRIdx);
7342 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
7343
7344 if (Subtarget->hasFPARMv8() && !IsWin64) {
7345 auto FPRArgRegs = AArch64::getFPRArgRegs();
7346 const unsigned NumFPRArgRegs = FPRArgRegs.size();
7347 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
7348
7349 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
7350 int FPRIdx = 0;
7351 if (FPRSaveSize != 0) {
7352 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
7353
7354 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
7355
7356 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
7357 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
7358 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
7359
7360 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
7361 MachinePointerInfo::getStack(MF, i * 16));
7362 MemOps.push_back(Store);
7363 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
7364 DAG.getConstant(16, DL, PtrVT));
7365 }
7366 }
7367 FuncInfo->setVarArgsFPRIndex(FPRIdx);
7368 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
7369 }
7370
7371 if (!MemOps.empty()) {
7372 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
7373 }
7374}
7375
7376/// LowerCallResult - Lower the result values of a call into the
7377/// appropriate copies out of appropriate physical registers.
7378SDValue AArch64TargetLowering::LowerCallResult(
7379 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
7380 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
7381 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
7382 SDValue ThisVal, bool RequiresSMChange) const {
7383 DenseMap<unsigned, SDValue> CopiedRegs;
7384 // Copy all of the result registers out of their specified physreg.
7385 for (unsigned i = 0; i != RVLocs.size(); ++i) {
7386 CCValAssign VA = RVLocs[i];
7387
7388 // Pass 'this' value directly from the argument to return value, to avoid
7389 // reg unit interference
7390 if (i == 0 && isThisReturn) {
7391 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
7392 "unexpected return calling convention register assignment");
7393 InVals.push_back(ThisVal);
7394 continue;
7395 }
7396
7397 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
7398 // allows one use of a physreg per block.
7399 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
7400 if (!Val) {
7401 Val =
7402 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
7403 Chain = Val.getValue(1);
7404 InGlue = Val.getValue(2);
7405 CopiedRegs[VA.getLocReg()] = Val;
7406 }
7407
7408 switch (VA.getLocInfo()) {
7409 default:
7410 llvm_unreachable("Unknown loc info!");
7411 case CCValAssign::Full:
7412 break;
7413 case CCValAssign::BCvt:
7414 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
7415 break;
7416 case CCValAssign::AExtUpper:
7417 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
7418 DAG.getConstant(32, DL, VA.getLocVT()));
7419 [[fallthrough]];
7420 case CCValAssign::AExt:
7421 [[fallthrough]];
7422 case CCValAssign::ZExt:
7423 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
7424 break;
7425 }
7426
7427 if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
7428 Val = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL, Val.getValueType(),
7429 Val);
7430
7431 InVals.push_back(Val);
7432 }
7433
7434 return Chain;
7435}
7436
7437/// Return true if the calling convention is one that we can guarantee TCO for.
7438static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
7439 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
7440 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
7441}
7442
7443/// Return true if we might ever do TCO for calls with this calling convention.
7444static bool mayTailCallThisCC(CallingConv::ID CC) {
7445 switch (CC) {
7446 case CallingConv::C:
7447 case CallingConv::AArch64_SVE_VectorCall:
7448 case CallingConv::PreserveMost:
7449 case CallingConv::PreserveAll:
7450 case CallingConv::Swift:
7451 case CallingConv::SwiftTail:
7452 case CallingConv::Tail:
7453 case CallingConv::Fast:
7454 return true;
7455 default:
7456 return false;
7457 }
7458}
7459
7460static void analyzeCallOperands(const AArch64TargetLowering &TLI,
7461 const AArch64Subtarget *Subtarget,
7462 const TargetLowering::CallLoweringInfo &CLI,
7463 CCState &CCInfo) {
7464 const SelectionDAG &DAG = CLI.DAG;
7465 CallingConv::ID CalleeCC = CLI.CallConv;
7466 bool IsVarArg = CLI.IsVarArg;
7467 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7468 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
7469
7470 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
7471 // for the shadow store.
7472 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
7473 CCInfo.AllocateStack(32, Align(16));
7474
7475 unsigned NumArgs = Outs.size();
7476 for (unsigned i = 0; i != NumArgs; ++i) {
7477 MVT ArgVT = Outs[i].VT;
7478 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
7479
7480 bool UseVarArgCC = false;
7481 if (IsVarArg) {
7482 // On Windows, the fixed arguments in a vararg call are passed in GPRs
7483 // too, so use the vararg CC to force them to integer registers.
7484 if (IsCalleeWin64) {
7485 UseVarArgCC = true;
7486 } else {
7487 UseVarArgCC = !Outs[i].IsFixed;
7488 }
7489 }
7490
7491 if (!UseVarArgCC) {
7492 // Get type of the original argument.
7493 EVT ActualVT =
7494 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
7495 /*AllowUnknown*/ true);
7496 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
7497 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
7498 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
7499 ArgVT = MVT::i8;
7500 else if (ActualMVT == MVT::i16)
7501 ArgVT = MVT::i16;
7502 }
7503
7504 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
7505 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
7506 assert(!Res && "Call operand has unhandled type");
7507 (void)Res;
7508 }
7509}
7510
7511bool AArch64TargetLowering::isEligibleForTailCallOptimization(
7512 const CallLoweringInfo &CLI) const {
7513 CallingConv::ID CalleeCC = CLI.CallConv;
7514 if (!mayTailCallThisCC(CalleeCC))
7515 return false;
7516
7517 SDValue Callee = CLI.Callee;
7518 bool IsVarArg = CLI.IsVarArg;
7519 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7520 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
7521 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
7522 const SelectionDAG &DAG = CLI.DAG;
7523 MachineFunction &MF = DAG.getMachineFunction();
7524 const Function &CallerF = MF.getFunction();
7525 CallingConv::ID CallerCC = CallerF.getCallingConv();
7526
7527 // SME Streaming functions are not eligible for TCO as they may require
7528 // the streaming mode or ZA to be restored after returning from the call.
7529 SMEAttrs CallerAttrs(MF.getFunction());
7530 auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal);
7531 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
7532 CallerAttrs.requiresLazySave(CalleeAttrs) ||
7533 CallerAttrs.hasStreamingBody())
7534 return false;
7535
7536 // Functions using the C or Fast calling convention that have an SVE signature
7537 // preserve more registers and should assume the SVE_VectorCall CC.
7538 // The check for matching callee-saved regs will determine whether it is
7539 // eligible for TCO.
7540 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
7541 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
7542 CallerCC = CallingConv::AArch64_SVE_VectorCall;
7543
7544 bool CCMatch = CallerCC == CalleeCC;
7545
7546 // When using the Windows calling convention on a non-windows OS, we want
7547 // to back up and restore X18 in such functions; we can't do a tail call
7548 // from those functions.
7549 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
7550 CalleeCC != CallingConv::Win64)
7551 return false;
7552
7553 // Byval parameters hand the function a pointer directly into the stack area
7554 // we want to reuse during a tail call. Working around this *is* possible (see
7555 // X86) but less efficient and uglier in LowerCall.
7556 for (Function::const_arg_iterator i = CallerF.arg_begin(),
7557 e = CallerF.arg_end();
7558 i != e; ++i) {
7559 if (i->hasByValAttr())
7560 return false;
7561
7562 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
7563 // In this case, it is necessary to save/restore X0 in the callee. Tail
7564 // call opt interferes with this. So we disable tail call opt when the
7565 // caller has an argument with "inreg" attribute.
7566
7567 // FIXME: Check whether the callee also has an "inreg" argument.
7568 if (i->hasInRegAttr())
7569 return false;
7570 }
7571
7572 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
7573 return CCMatch;
7574
7575 // Externally-defined functions with weak linkage should not be
7576 // tail-called on AArch64 when the OS does not support dynamic
7577 // pre-emption of symbols, as the AAELF spec requires normal calls
7578 // to undefined weak functions to be replaced with a NOP or jump to the
7579 // next instruction. The behaviour of branch instructions in this
7580 // situation (as used for tail calls) is implementation-defined, so we
7581 // cannot rely on the linker replacing the tail call with a return.
7582 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
7583 const GlobalValue *GV = G->getGlobal();
7584 const Triple &TT = getTargetMachine().getTargetTriple();
7585 if (GV->hasExternalWeakLinkage() &&
7586 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
7587 return false;
7588 }
7589
7590 // Now we search for cases where we can use a tail call without changing the
7591 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
7592 // concept.
7593
7594 // I want anyone implementing a new calling convention to think long and hard
7595 // about this assert.
7596 assert((!IsVarArg || CalleeCC == CallingConv::C) &&
7597 "Unexpected variadic calling convention");
7598
7599 LLVMContext &C = *DAG.getContext();
7600 // Check that the call results are passed in the same way.
7601 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
7602 CCAssignFnForCall(CalleeCC, IsVarArg),
7603 CCAssignFnForCall(CallerCC, IsVarArg)))
7604 return false;
7605 // The callee has to preserve all registers the caller needs to preserve.
7606 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7607 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
7608 if (!CCMatch) {
7609 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
7610 if (Subtarget->hasCustomCallingConv()) {
7611 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
7612 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
7613 }
7614 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
7615 return false;
7616 }
7617
7618 // Nothing more to check if the callee is taking no arguments
7619 if (Outs.empty())
7620 return true;
7621
7622 SmallVector<CCValAssign, 16> ArgLocs;
7623 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
7624
7625 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
7626
7627 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
7628 // When the call is musttail, additional checks have already been done, so we can safely skip this check.
7629 // At least two cases here: if caller is fastcc then we can't have any
7630 // memory arguments (we'd be expected to clean up the stack afterwards). If
7631 // caller is C then we could potentially use its argument area.
7632
7633 // FIXME: for now we take the most conservative of these in both cases:
7634 // disallow all variadic memory operands.
7635 for (const CCValAssign &ArgLoc : ArgLocs)
7636 if (!ArgLoc.isRegLoc())
7637 return false;
7638 }
7639
7640 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7641
7642 // If any of the arguments is passed indirectly, it must be SVE, so the
7643 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
7644 // allocate space on the stack. That is why we determine this explicitly
7645 // here: if any argument is passed indirectly, the call cannot be a tail call.
7646 if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
7647 assert((A.getLocInfo() != CCValAssign::Indirect ||
7648 A.getValVT().isScalableVector() ||
7649 Subtarget->isWindowsArm64EC()) &&
7650 "Expected value to be scalable");
7651 return A.getLocInfo() == CCValAssign::Indirect;
7652 }))
7653 return false;
7654
7655 // If the stack arguments for this call do not fit into our own save area then
7656 // the call cannot be made tail.
7657 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
7658 return false;
7659
7660 const MachineRegisterInfo &MRI = MF.getRegInfo();
7661 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
7662 return false;
7663
7664 return true;
7665}
7666
7667SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
7668 SelectionDAG &DAG,
7669 MachineFrameInfo &MFI,
7670 int ClobberedFI) const {
7671 SmallVector<SDValue, 8> ArgChains;
7672 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
7673 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
7674
7675 // Include the original chain at the beginning of the list. When this is
7676 // used by target LowerCall hooks, this helps legalize find the
7677 // CALLSEQ_BEGIN node.
7678 ArgChains.push_back(Chain);
7679
7680 // Add a chain value for each stack argument load that overlaps the clobbered frame object.
7681 for (SDNode *U : DAG.getEntryNode().getNode()->uses())
7682 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
7683 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
7684 if (FI->getIndex() < 0) {
7685 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
7686 int64_t InLastByte = InFirstByte;
7687 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
7688
7689 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
7690 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
7691 ArgChains.push_back(SDValue(L, 1));
7692 }
7693
7694 // Build a tokenfactor for all the chains.
7695 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
7696}
7697
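// Illustrative sketch (not part of the original source): the inclusive
// byte-range overlap test used by addTokenForArgument above, pulled out into
// a standalone helper. The function name is made up for illustration.
static bool sketchFrameBytesOverlap(int64_t FirstByte, int64_t LastByte,
                                    int64_t InFirstByte, int64_t InLastByte) {
  // Two inclusive ranges overlap if either range starts inside the other.
  return (InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
         (FirstByte <= InFirstByte && InFirstByte <= LastByte);
}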
7698bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
7699 bool TailCallOpt) const {
7700 return (CallCC == CallingConv::Fast && TailCallOpt) ||
7701 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
7702}
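// For example (illustrative): a tailcc or swifttailcc callee always pops its
// own stack arguments, and a fastcc callee does so when GuaranteedTailCallOpt
// is set; LowerCall below then passes CalleePopBytes = alignTo(NumBytes, 16)
// to CALLSEQ_END.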
7703
7704// Check if the value is zero-extended from i1 to i8
7705static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
7706 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
7707 if (SizeInBits < 8)
7708 return false;
7709
7710 APInt RequiredZero(SizeInBits, 0xFE);
7711 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
7712 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
7713 return ZExtBool;
7714}
7715
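// Illustrative sketch (not part of the original source): the same known-bits
// test as checkZExtBool above, written with plain integers. The helper name
// and the KnownZeroMask parameter are made up; KnownZeroMask stands in for
// Bits.Zero.
static bool sketchIsZExtBool(uint64_t KnownZeroMask, unsigned SizeInBits) {
  if (SizeInBits < 8)
    return false;
  const uint64_t RequiredZero = 0xFE; // bits [7:1] must be known zero
  return (KnownZeroMask & RequiredZero) == RequiredZero;
}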
7716void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
7717 SDNode *Node) const {
7718 // Live-in physreg copies that are glued to SMSTART are applied as
7719 // implicit-def's in the InstrEmitter. Here we remove them, allowing the
7720 // register allocator to pass call args in callee saved regs, without extra
7721 // copies to avoid these fake clobbers of actually-preserved GPRs.
7722 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
7723 MI.getOpcode() == AArch64::MSRpstatePseudo) {
7724 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
7725 if (MachineOperand &MO = MI.getOperand(I);
7726 MO.isReg() && MO.isImplicit() && MO.isDef() &&
7727 (AArch64::GPR32RegClass.contains(MO.getReg()) ||
7728 AArch64::GPR64RegClass.contains(MO.getReg())))
7729 MI.removeOperand(I);
7730
7731 // The SVE vector length can change when entering/leaving streaming mode.
7732 if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
7733 MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
7734 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
7735 /*IsImplicit=*/true));
7736 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
7737 /*IsImplicit=*/true));
7738 }
7739 }
7740
7741 // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
7742 // have nothing to do with VG, were it not that they are used to materialise a
7743 // frame-address. If they contain a frame-index to a scalable vector, this
7744 // will likely require an ADDVL instruction to materialise the address, thus
7745 // reading VG.
7746 const MachineFunction &MF = *MI.getMF();
7748 (MI.getOpcode() == AArch64::ADDXri ||
7749 MI.getOpcode() == AArch64::SUBXri)) {
7750 const MachineOperand &MO = MI.getOperand(1);
7751 if (MO.isFI() && MF.getFrameInfo().getStackID(MO.getIndex()) ==
7752 TargetStackID::ScalableVector)
7753 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
7754 /*IsImplicit=*/true));
7755 }
7756}
7757
7758 SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
7759 bool Enable, SDValue Chain,
7760 SDValue InGlue,
7761 unsigned Condition,
7762 SDValue PStateSM) const {
7763 MachineFunction &MF = DAG.getMachineFunction();
7764 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7765 FuncInfo->setHasStreamingModeChanges(true);
7766
7767 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7768 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
7769 SDValue MSROp =
7770 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
7771 SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
7772 SmallVector<SDValue> Ops = {Chain, MSROp, ConditionOp};
7773 if (Condition != AArch64SME::Always) {
7774 assert(PStateSM && "PStateSM should be defined");
7775 Ops.push_back(PStateSM);
7776 }
7777 Ops.push_back(RegMask);
7778
7779 if (InGlue)
7780 Ops.push_back(InGlue);
7781
7782 unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
7783 return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
7784}
7785
7786static unsigned getSMCondition(const SMEAttrs &CallerAttrs,
7787 const SMEAttrs &CalleeAttrs) {
7788 if (!CallerAttrs.hasStreamingCompatibleInterface() ||
7789 CallerAttrs.hasStreamingBody())
7790 return AArch64SME::Always;
7791 if (CalleeAttrs.hasNonStreamingInterface())
7792 return AArch64SME::IfCallerIsStreaming;
7793 if (CalleeAttrs.hasStreamingInterface())
7794 return AArch64SME::IfCallerIsNonStreaming;
7795
7796 llvm_unreachable("Unsupported attributes");
7797}
7798
7799/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
7800/// and add input and output parameter nodes.
7801SDValue
7802AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
7803 SmallVectorImpl<SDValue> &InVals) const {
7804 SelectionDAG &DAG = CLI.DAG;
7805 SDLoc &DL = CLI.DL;
7806 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7807 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
7808 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
7809 SDValue Chain = CLI.Chain;
7810 SDValue Callee = CLI.Callee;
7811 bool &IsTailCall = CLI.IsTailCall;
7812 CallingConv::ID &CallConv = CLI.CallConv;
7813 bool IsVarArg = CLI.IsVarArg;
7814
7815 MachineFunction &MF = DAG.getMachineFunction();
7816 MachineFunction::CallSiteInfo CSInfo;
7817 bool IsThisReturn = false;
7818
7819 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7820 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
7821 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
7822 bool IsSibCall = false;
7823 bool GuardWithBTI = false;
7824
7825 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
7826 !Subtarget->noBTIAtReturnTwice()) {
7827 GuardWithBTI = FuncInfo->branchTargetEnforcement();
7828 }
7829
7830 // Analyze operands of the call, assigning locations to each operand.
7831 SmallVector<CCValAssign, 16> ArgLocs;
7832 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
7833
7834 if (IsVarArg) {
7835 unsigned NumArgs = Outs.size();
7836
7837 for (unsigned i = 0; i != NumArgs; ++i) {
7838 if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
7839 report_fatal_error("Passing SVE types to variadic functions is "
7840 "currently not supported");
7841 }
7842 }
7843
7844 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
7845
7846 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
7847 // Assign locations to each value returned by this call.
7848 SmallVector<CCValAssign, 16> RVLocs;
7849 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
7850 *DAG.getContext());
7851 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
7852
7853 // Check callee args/returns for SVE registers and set calling convention
7854 // accordingly.
7855 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
7856 auto HasSVERegLoc = [](CCValAssign &Loc) {
7857 if (!Loc.isRegLoc())
7858 return false;
7859 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
7860 AArch64::PPRRegClass.contains(Loc.getLocReg());
7861 };
7862 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
7863 CallConv = CallingConv::AArch64_SVE_VectorCall;
7864 }
7865
7866 if (IsTailCall) {
7867 // Check if it's really possible to do a tail call.
7868 IsTailCall = isEligibleForTailCallOptimization(CLI);
7869
7870 // A sibling call is one where we're under the usual C ABI and not planning
7871 // to change that but can still do a tail call:
7872 if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
7873 CallConv != CallingConv::SwiftTail)
7874 IsSibCall = true;
7875
7876 if (IsTailCall)
7877 ++NumTailCalls;
7878 }
7879
7880 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
7881 report_fatal_error("failed to perform tail call elimination on a call "
7882 "site marked musttail");
7883
7884 // Get a count of how many bytes are to be pushed on the stack.
7885 unsigned NumBytes = CCInfo.getStackSize();
7886
7887 if (IsSibCall) {
7888 // Since we're not changing the ABI to make this a tail call, the memory
7889 // operands are already available in the caller's incoming argument space.
7890 NumBytes = 0;
7891 }
7892
7893 // FPDiff is the byte offset of the call's argument area from the callee's.
7894 // Stores to callee stack arguments will be placed in FixedStackSlots offset
7895 // by this amount for a tail call. In a sibling call it must be 0 because the
7896 // caller will deallocate the entire stack and the callee still expects its
7897 // arguments to begin at SP+0. Completely unused for non-tail calls.
7898 int FPDiff = 0;
7899
7900 if (IsTailCall && !IsSibCall) {
7901 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
7902
7903 // Since callee will pop argument stack as a tail call, we must keep the
7904 // popped size 16-byte aligned.
7905 NumBytes = alignTo(NumBytes, 16);
7906
7907 // FPDiff will be negative if this tail call requires more space than we
7908 // would automatically have in our incoming argument space. Positive if we
7909 // can actually shrink the stack.
7910 FPDiff = NumReusableBytes - NumBytes;
7911
7912 // Update the required reserved area if this is the tail call requiring the
7913 // most argument stack space.
7914 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
7915 FuncInfo->setTailCallReservedStack(-FPDiff);
7916
7917 // The stack pointer must be 16-byte aligned at all times it's used for a
7918 // memory operation, which in practice means at *all* times and in
7919 // particular across call boundaries. Therefore our own arguments started at
7920 // a 16-byte aligned SP and the delta applied for the tail call should
7921 // satisfy the same constraint.
7922 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
7923 }
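  // Worked example (illustrative): if the caller's incoming stack-argument
  // area is 32 bytes (NumReusableBytes = 32) and this tail call needs 48
  // bytes of argument stack after alignment (NumBytes = 48), then
  // FPDiff = 32 - 48 = -16, 16 bytes are recorded via
  // setTailCallReservedStack(16), and each callee stack slot below is created
  // at Offset + FPDiff.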
7924
7925 // Determine whether we need any streaming mode changes.
7926 SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction());
7927 if (CLI.CB)
7928 CalleeAttrs = SMEAttrs(*CLI.CB);
7929 else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
7930 CalleeAttrs = SMEAttrs(ES->getSymbol());
7931
7932 auto DescribeCallsite =
7934 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
7935 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
7936 R << ore::NV("Callee", ES->getSymbol());
7937 else if (CLI.CB && CLI.CB->getCalledFunction())
7938 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
7939 else
7940 R << "unknown callee";
7941 R << "'";
7942 return R;
7943 };
7944
7945 bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
7946 if (RequiresLazySave) {
7947 unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj();
7949 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj,
7951 SDValue NumZaSaveSlicesAddr =
7952 DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
7953 DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));
7954 SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
7955 DAG.getConstant(1, DL, MVT::i32));
7956 Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr,
7957 MPI, MVT::i16);
7958 Chain = DAG.getNode(
7959 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
7960 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
7961 TPIDR2ObjAddr);
7963 ORE.emit([&]() {
7964 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
7965 CLI.CB)
7966 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
7967 &MF.getFunction());
7968 return DescribeCallsite(R) << " sets up a lazy save for ZA";
7969 });
7970 }
7971
7972 SDValue PStateSM;
7973 bool RequiresSMChange = CallerAttrs.requiresSMChange(CalleeAttrs);
7974 if (RequiresSMChange) {
7975 if (CallerAttrs.hasStreamingInterfaceOrBody())
7976 PStateSM = DAG.getConstant(1, DL, MVT::i64);
7977 else if (CallerAttrs.hasNonStreamingInterface())
7978 PStateSM = DAG.getConstant(0, DL, MVT::i64);
7979 else
7980 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
7982 ORE.emit([&]() {
7983 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
7984 CLI.CB)
7985 : OptimizationRemarkAnalysis("sme", "SMETransition",
7986 &MF.getFunction());
7987 DescribeCallsite(R) << " requires a streaming mode transition";
7988 return R;
7989 });
7990 }
7991
7992 SDValue ZTFrameIdx;
7993 MachineFrameInfo &MFI = MF.getFrameInfo();
7994 bool ShouldPreserveZT0 = CallerAttrs.requiresPreservingZT0(CalleeAttrs);
7995
7996 // If the caller has ZT0 state which will not be preserved by the callee,
7997 // spill ZT0 before the call.
7998 if (ShouldPreserveZT0) {
7999 unsigned ZTObj = MFI.CreateSpillStackObject(64, Align(16));
8000 ZTFrameIdx = DAG.getFrameIndex(
8001 ZTObj,
8003
8004 Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
8005 {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
8006 }
8007
8008 // If the caller shares ZT0 but the callee does not share ZA, we need to stop
8009 // PSTATE.ZA before the call if there is no lazy-save active.
8010 bool DisableZA = CallerAttrs.requiresDisablingZABeforeCall(CalleeAttrs);
8011 assert((!DisableZA || !RequiresLazySave) &&
8012 "Lazy-save should have PSTATE.SM=1 on entry to the function");
8013
8014 if (DisableZA)
8015 Chain = DAG.getNode(
8016 AArch64ISD::SMSTOP, DL, MVT::Other, Chain,
8017 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
8018 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
8019
8020 // Adjust the stack pointer for the new arguments...
8021 // These operations are automatically eliminated by the prolog/epilog pass
8022 if (!IsSibCall)
8023 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
8024
8025 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
8026 getPointerTy(DAG.getDataLayout()));
8027
8028 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
8029 SmallSet<unsigned, 8> RegsUsed;
8030 SmallVector<SDValue, 8> MemOpChains;
8031 auto PtrVT = getPointerTy(DAG.getDataLayout());
8032
8033 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
8034 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
8035 for (const auto &F : Forwards) {
8036 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
8037 RegsToPass.emplace_back(F.PReg, Val);
8038 }
8039 }
8040
8041 // Walk the register/memloc assignments, inserting copies/loads.
8042 unsigned ExtraArgLocs = 0;
8043 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
8044 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
8045 SDValue Arg = OutVals[i];
8046 ISD::ArgFlagsTy Flags = Outs[i].Flags;
8047
8048 // Promote the value if needed.
8049 switch (VA.getLocInfo()) {
8050 default:
8051 llvm_unreachable("Unknown loc info!");
8052 case CCValAssign::Full:
8053 break;
8054 case CCValAssign::SExt:
8055 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
8056 break;
8057 case CCValAssign::ZExt:
8058 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
8059 break;
8060 case CCValAssign::AExt:
8061 if (Outs[i].ArgVT == MVT::i1) {
8062 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
8063 //
8064 // Check if we actually have to do this, because the value may
8065 // already be zero-extended.
8066 //
8067 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
8068 // and rely on DAGCombiner to fold this, because the following
8069 // (anyext i32) is combined with (zext i8) in DAG.getNode:
8070 //
8071 // (ext (zext x)) -> (zext x)
8072 //
8073 // This will give us (zext i32), which we cannot remove, so
8074 // try to check this beforehand.
8075 if (!checkZExtBool(Arg, DAG)) {
8076 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
8077 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
8078 }
8079 }
8080 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
8081 break;
8083 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
8084 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
8085 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
8086 DAG.getConstant(32, DL, VA.getLocVT()));
8087 break;
8088 case CCValAssign::BCvt:
8089 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
8090 break;
8091 case CCValAssign::Trunc:
8092 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8093 break;
8094 case CCValAssign::FPExt:
8095 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
8096 break;
8098 bool isScalable = VA.getValVT().isScalableVT();
8099 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
8100 "Indirect arguments should be scalable on most subtargets");
8101
8102 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
8103 uint64_t PartSize = StoreSize;
8104 unsigned NumParts = 1;
8105 if (Outs[i].Flags.isInConsecutiveRegs()) {
8106 assert(!Outs[i].Flags.isInConsecutiveRegsLast());
8107 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8108 ++NumParts;
8109 StoreSize *= NumParts;
8110 }
8111
8112 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
8113 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
8114 MachineFrameInfo &MFI = MF.getFrameInfo();
8115 int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
8116 if (isScalable)
8118
8122 SDValue SpillSlot = Ptr;
8123
8124 // Ensure we generate all stores for each tuple part, whilst updating the
8125 // pointer after each store correctly using vscale.
8126 while (NumParts) {
8127 SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
8128 MemOpChains.push_back(Store);
8129
8130 NumParts--;
8131 if (NumParts > 0) {
8132 SDValue BytesIncrement;
8133 if (isScalable) {
8134 BytesIncrement = DAG.getVScale(
8135 DL, Ptr.getValueType(),
8136 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
8137 } else {
8138 BytesIncrement = DAG.getConstant(
8139 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
8140 Ptr.getValueType());
8141 }
8143 Flags.setNoUnsignedWrap(true);
8144
8145 MPI = MachinePointerInfo(MPI.getAddrSpace());
8146 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8147 BytesIncrement, Flags);
8148 ExtraArgLocs++;
8149 i++;
8150 }
8151 }
8152
8153 Arg = SpillSlot;
8154 break;
8155 }
8156
8157 if (VA.isRegLoc()) {
8158 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
8159 Outs[0].VT == MVT::i64) {
8160 assert(VA.getLocVT() == MVT::i64 &&
8161 "unexpected calling convention register assignment");
8162 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
8163 "unexpected use of 'returned'");
8164 IsThisReturn = true;
8165 }
8166 if (RegsUsed.count(VA.getLocReg())) {
8167 // If this register has already been used then we're trying to pack
8168 // parts of an [N x i32] into an X-register. The extension type will
8169 // take care of putting the two halves in the right place but we have to
8170 // combine them.
8171 SDValue &Bits =
8172 llvm::find_if(RegsToPass,
8173 [=](const std::pair<unsigned, SDValue> &Elt) {
8174 return Elt.first == VA.getLocReg();
8175 })
8176 ->second;
8177 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
8178 // Call site info is used for function's parameter entry value
8179 // tracking. For now we track only simple cases when parameter
8180 // is transferred through whole register.
8182 [&VA](MachineFunction::ArgRegPair ArgReg) {
8183 return ArgReg.Reg == VA.getLocReg();
8184 });
8185 } else {
8186 // Add an extra level of indirection for streaming mode changes by
8187 // using a pseudo copy node that cannot be rematerialised between a
8188 // smstart/smstop and the call by the simple register coalescer.
8189 if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
8190 Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8191 Arg.getValueType(), Arg);
8192 RegsToPass.emplace_back(VA.getLocReg(), Arg);
8193 RegsUsed.insert(VA.getLocReg());
8194 const TargetOptions &Options = DAG.getTarget().Options;
8195 if (Options.EmitCallSiteInfo)
8196 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
8197 }
8198 } else {
8199 assert(VA.isMemLoc());
8200
8201 SDValue DstAddr;
8202 MachinePointerInfo DstInfo;
8203
8204 // FIXME: This works on big-endian for composite byvals, which are the
8205 // common case. It should also work for fundamental types.
8206 uint32_t BEAlign = 0;
8207 unsigned OpSize;
8208 if (VA.getLocInfo() == CCValAssign::Indirect ||
8210 OpSize = VA.getLocVT().getFixedSizeInBits();
8211 else
8212 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
8213 : VA.getValVT().getSizeInBits();
8214 OpSize = (OpSize + 7) / 8;
8215 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
8216 !Flags.isInConsecutiveRegs()) {
8217 if (OpSize < 8)
8218 BEAlign = 8 - OpSize;
8219 }
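      // For example (illustrative): on big-endian, a 2-byte operand has
      // OpSize = 2, so BEAlign = 6 and the value is stored at the high-address
      // end of its 8-byte stack slot.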
8220 unsigned LocMemOffset = VA.getLocMemOffset();
8221 int32_t Offset = LocMemOffset + BEAlign;
8222 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
8223 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
8224
8225 if (IsTailCall) {
8226 Offset = Offset + FPDiff;
8227 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
8228
8229 DstAddr = DAG.getFrameIndex(FI, PtrVT);
8230 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
8231
8232 // Make sure any stack arguments overlapping with where we're storing
8233 // are loaded before this eventual operation. Otherwise they'll be
8234 // clobbered.
8235 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
8236 } else {
8237 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
8238
8239 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
8240 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
8241 }
8242
8243 if (Outs[i].Flags.isByVal()) {
8244 SDValue SizeNode =
8245 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
8246 SDValue Cpy = DAG.getMemcpy(
8247 Chain, DL, DstAddr, Arg, SizeNode,
8248 Outs[i].Flags.getNonZeroByValAlign(),
8249 /*isVol = */ false, /*AlwaysInline = */ false,
8250 /*isTailCall = */ false, DstInfo, MachinePointerInfo());
8251
8252 MemOpChains.push_back(Cpy);
8253 } else {
8254 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
8255 // promoted to a legal register type i32, we should truncate Arg back to
8256 // i1/i8/i16.
8257 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
8258 VA.getValVT() == MVT::i16)
8259 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
8260
8261 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
8262 MemOpChains.push_back(Store);
8263 }
8264 }
8265 }
8266
8267 if (IsVarArg && Subtarget->isWindowsArm64EC()) {
8268 SDValue ParamPtr = StackPtr;
8269 if (IsTailCall) {
8270 // Create a dummy object at the top of the stack that can be used to get
8271 // the SP after the epilogue
8272 int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
8273 ParamPtr = DAG.getFrameIndex(FI, PtrVT);
8274 }
8275
8276 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
8277 // describing the argument list. x4 contains the address of the
8278 // first stack parameter. x5 contains the size in bytes of all parameters
8279 // passed on the stack.
8280 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
8281 RegsToPass.emplace_back(AArch64::X5,
8282 DAG.getConstant(NumBytes, DL, MVT::i64));
8283 }
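  // For example (illustrative): a variadic Arm64EC call that passes 24 bytes
  // of arguments on the stack ends up with x4 = address of the first stack
  // parameter and x5 = 24.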
8284
8285 if (!MemOpChains.empty())
8286 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
8287
8288 SDValue InGlue;
8289 if (RequiresSMChange) {
8290 SDValue NewChain = changeStreamingMode(
8291 DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain, InGlue,
8292 getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
8293 Chain = NewChain.getValue(0);
8294 InGlue = NewChain.getValue(1);
8295 }
8296
8297 // Build a sequence of copy-to-reg nodes chained together with token chain
8298 // and flag operands which copy the outgoing args into the appropriate regs.
8299 for (auto &RegToPass : RegsToPass) {
8300 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
8301 RegToPass.second, InGlue);
8302 InGlue = Chain.getValue(1);
8303 }
8304
8305 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
8306 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
8307 // node so that legalize doesn't hack it.
8308 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
8309 auto GV = G->getGlobal();
8310 unsigned OpFlags =
8312 if (OpFlags & AArch64II::MO_GOT) {
8313 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
8314 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
8315 } else {
8316 const GlobalValue *GV = G->getGlobal();
8317 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
8318 }
8319 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
8320 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
8321 Subtarget->isTargetMachO()) ||
8323 const char *Sym = S->getSymbol();
8324 if (UseGot) {
8326 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
8327 } else {
8328 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
8329 }
8330 }
8331
8332 // We don't usually want to end the call-sequence here because we would tidy
8333 // the frame up *after* the call; however, in the ABI-changing tail-call case
8334 // we've carefully laid out the parameters so that when sp is reset they'll be
8335 // in the correct location.
8336 if (IsTailCall && !IsSibCall) {
8337 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
8338 InGlue = Chain.getValue(1);
8339 }
8340
8341 std::vector<SDValue> Ops;
8342 Ops.push_back(Chain);
8343 Ops.push_back(Callee);
8344
8345 if (IsTailCall) {
8346 // Each tail call may have to adjust the stack by a different amount, so
8347 // this information must travel along with the operation for eventual
8348 // consumption by emitEpilogue.
8349 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
8350 }
8351
8352 // Add argument registers to the end of the list so that they are known live
8353 // into the call.
8354 for (auto &RegToPass : RegsToPass)
8355 Ops.push_back(DAG.getRegister(RegToPass.first,
8356 RegToPass.second.getValueType()));
8357
8358 // Add a register mask operand representing the call-preserved registers.
8359 const uint32_t *Mask;
8360 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8361 if (IsThisReturn) {
8362 // For 'this' returns, use the X0-preserving mask if applicable
8363 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
8364 if (!Mask) {
8365 IsThisReturn = false;
8366 Mask = TRI->getCallPreservedMask(MF, CallConv);
8367 }
8368 } else
8369 Mask = TRI->getCallPreservedMask(MF, CallConv);
8370
8371 if (Subtarget->hasCustomCallingConv())
8372 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
8373
8374 if (TRI->isAnyArgRegReserved(MF))
8375 TRI->emitReservedArgRegCallError(MF);
8376
8377 assert(Mask && "Missing call preserved mask for calling convention");
8378 Ops.push_back(DAG.getRegisterMask(Mask));
8379
8380 if (InGlue.getNode())
8381 Ops.push_back(InGlue);
8382
8383 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
8384
8385 // If we're doing a tail call, use a TC_RETURN here rather than an
8386 // actual call instruction.
8387 if (IsTailCall) {
8389 SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
8390
8391 if (IsCFICall)
8392 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
8393
8394 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
8395 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
8396 return Ret;
8397 }
8398
8399 unsigned CallOpc = AArch64ISD::CALL;
8400 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
8401 // be expanded to the call, directly followed by a special marker sequence and
8402 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
8403 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
8404 assert(!IsTailCall &&
8405 "tail calls cannot be marked with clang.arc.attachedcall");
8406 CallOpc = AArch64ISD::CALL_RVMARKER;
8407
8408 // Add a target global address for the retainRV/claimRV runtime function
8409 // just before the call target.
8410 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
8411 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
8412 Ops.insert(Ops.begin() + 1, GA);
8413 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
8415 } else if (GuardWithBTI) {
8416 CallOpc = AArch64ISD::CALL_BTI;
8417 }
8418
8419 // Returns a chain and a flag for retval copy to use.
8420 Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops);
8421
8422 if (IsCFICall)
8423 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
8424
8425 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
8426 InGlue = Chain.getValue(1);
8427 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
8428
8429 uint64_t CalleePopBytes =
8430 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
8431
8432 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
8433 InGlue = Chain.getValue(1);
8434
8435 // Handle result values, copying them out of physregs into vregs that we
8436 // return.
8437 SDValue Result = LowerCallResult(
8438 Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
8439 IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
8440
8441 if (!Ins.empty())
8442 InGlue = Result.getValue(Result->getNumValues() - 1);
8443
8444 if (RequiresSMChange) {
8445 assert(PStateSM && "Expected a PStateSM to be set");
8446 Result = changeStreamingMode(
8447 DAG, DL, !CalleeAttrs.hasStreamingInterface(), Result, InGlue,
8448 getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
8449 }
8450
8451 if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs))
8452 // Unconditionally resume ZA.
8453 Result = DAG.getNode(
8454 AArch64ISD::SMSTART, DL, MVT::Other, Result,
8455 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
8456 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
8457
8458 if (ShouldPreserveZT0)
8459 Result =
8460 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
8461 {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
8462
8463 if (RequiresLazySave) {
8464 // Conditionally restore the lazy save using a pseudo node.
8465 unsigned FI = FuncInfo->getLazySaveTPIDR2Obj();
8466 SDValue RegMask = DAG.getRegisterMask(
8467 TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
8468 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
8469 "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout()));
8470 SDValue TPIDR2_EL0 = DAG.getNode(
8471 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
8472 DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
8473
8474 // Copy the address of the TPIDR2 block into X0 before 'calling' the
8475 // RESTORE_ZA pseudo.
8476 SDValue Glue;
8477 SDValue TPIDR2Block = DAG.getFrameIndex(
8479 Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
8480 Result =
8481 DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
8482 {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
8483 RestoreRoutine, RegMask, Result.getValue(1)});
8484
8485 // Finally reset the TPIDR2_EL0 register to 0.
8486 Result = DAG.getNode(
8487 ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
8488 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8489 DAG.getConstant(0, DL, MVT::i64));
8490 }
8491
8492 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) {
8493 for (unsigned I = 0; I < InVals.size(); ++I) {
8494 // The smstart/smstop is chained as part of the call, but when the
8495 // resulting chain is discarded (which happens when the call is not part
8496 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
8497 // smstart/smstop is chained to the result value. We can do that by doing
8498 // a vreg -> vreg copy.
8499 Register Reg = MF.getRegInfo().createVirtualRegister(
8500 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
8501 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
8502 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
8503 InVals[I].getValueType());
8504 }
8505 }
8506
8507 return Result;
8508}
8509
8510bool AArch64TargetLowering::CanLowerReturn(
8511 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
8512 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
8513 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
8514 SmallVector<CCValAssign, 16> RVLocs;
8515 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
8516 return CCInfo.CheckReturn(Outs, RetCC);
8517}
8518
8519SDValue
8520AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
8521 bool isVarArg,
8523 const SmallVectorImpl<SDValue> &OutVals,
8524 const SDLoc &DL, SelectionDAG &DAG) const {
8525 auto &MF = DAG.getMachineFunction();
8526 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8527
8528 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
8529 SmallVector<CCValAssign, 16> RVLocs;
8530 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
8531 CCInfo.AnalyzeReturn(Outs, RetCC);
8532
8533 // Copy the result values into the output registers.
8534 SDValue Glue;
8535 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
8536 SmallSet<unsigned, 4> RegsUsed;
8537 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
8538 ++i, ++realRVLocIdx) {
8539 CCValAssign &VA = RVLocs[i];
8540 assert(VA.isRegLoc() && "Can only return in registers!");
8541 SDValue Arg = OutVals[realRVLocIdx];
8542
8543 switch (VA.getLocInfo()) {
8544 default:
8545 llvm_unreachable("Unknown loc info!");
8546 case CCValAssign::Full:
8547 if (Outs[i].ArgVT == MVT::i1) {
8548 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
8549 // value. This is strictly redundant on Darwin (which uses "zeroext
8550 // i1"), but will be optimised out before ISel.
8551 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
8552 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
8553 }
8554 break;
8555 case CCValAssign::BCvt:
8556 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
8557 break;
8558 case CCValAssign::AExt:
8559 case CCValAssign::ZExt:
8560 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8561 break;
8563 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
8564 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8565 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
8566 DAG.getConstant(32, DL, VA.getLocVT()));
8567 break;
8568 }
8569
8570 if (RegsUsed.count(VA.getLocReg())) {
8571 SDValue &Bits =
8572 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
8573 return Elt.first == VA.getLocReg();
8574 })->second;
8575 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
8576 } else {
8577 RetVals.emplace_back(VA.getLocReg(), Arg);
8578 RegsUsed.insert(VA.getLocReg());
8579 }
8580 }
8581
8582 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8583
8584 // Emit SMSTOP before returning from a locally streaming function
8585 SMEAttrs FuncAttrs(MF.getFunction());
8586 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
8587 if (FuncAttrs.hasStreamingCompatibleInterface()) {
8588 Register Reg = FuncInfo->getPStateSMReg();
8589 assert(Reg.isValid() && "PStateSM Register is invalid");
8590 SDValue PStateSM = DAG.getCopyFromReg(Chain, DL, Reg, MVT::i64);
8591 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
8592 /*Glue*/ SDValue(),
8594 } else
8595 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
8596 /*Glue*/ SDValue(), AArch64SME::Always);
8597 Glue = Chain.getValue(1);
8598 }
8599
8600 SmallVector<SDValue, 4> RetOps(1, Chain);
8601 for (auto &RetVal : RetVals) {
8602 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
8603 isPassedInFPR(RetVal.second.getValueType()))
8604 RetVal.second = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8605 RetVal.second.getValueType(), RetVal.second);
8606 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
8607 Glue = Chain.getValue(1);
8608 RetOps.push_back(
8609 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
8610 }
8611
8612 // Windows AArch64 ABIs require that for returning structs by value we copy
8613 // the sret argument into X0 for the return.
8614 // We saved the argument into a virtual register in the entry block,
8615 // so now we copy the value out and into X0.
8616 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
8617 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
8619
8620 unsigned RetValReg = AArch64::X0;
8621 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8622 RetValReg = AArch64::X8;
8623 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
8624 Glue = Chain.getValue(1);
8625
8626 RetOps.push_back(
8627 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
8628 }
8629
8630 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
8631 if (I) {
8632 for (; *I; ++I) {
8633 if (AArch64::GPR64RegClass.contains(*I))
8634 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
8635 else if (AArch64::FPR64RegClass.contains(*I))
8636 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
8637 else
8638 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
8639 }
8640 }
8641
8642 RetOps[0] = Chain; // Update chain.
8643
8644 // Add the glue if we have it.
8645 if (Glue.getNode())
8646 RetOps.push_back(Glue);
8647
8648 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
8649 // ARM64EC entry thunks use a special return sequence: instead of a regular
8650 // "ret" instruction, they need to explicitly call the emulator.
8651 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8652 SDValue Arm64ECRetDest =
8653 DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
8654 Arm64ECRetDest =
8655 getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
8656 Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
8658 RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
8659 RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
8660 return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
8661 }
8662
8663 return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
8664}
8665
8666//===----------------------------------------------------------------------===//
8667// Other Lowering Code
8668//===----------------------------------------------------------------------===//
8669
8670SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
8671 SelectionDAG &DAG,
8672 unsigned Flag) const {
8673 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
8674 N->getOffset(), Flag);
8675}
8676
8677SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
8678 SelectionDAG &DAG,
8679 unsigned Flag) const {
8680 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
8681}
8682
8683SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
8684 SelectionDAG &DAG,
8685 unsigned Flag) const {
8686 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
8687 N->getOffset(), Flag);
8688}
8689
8690SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
8691 SelectionDAG &DAG,
8692 unsigned Flag) const {
8693 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
8694}
8695
8696SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
8697 SelectionDAG &DAG,
8698 unsigned Flag) const {
8699 return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
8700}
8701
8702// (loadGOT sym)
8703template <class NodeTy>
8704SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
8705 unsigned Flags) const {
8706 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
8707 SDLoc DL(N);
8708 EVT Ty = getPointerTy(DAG.getDataLayout());
8709 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
8710 // FIXME: Once remat is capable of dealing with instructions with register
8711 // operands, expand this into two nodes instead of using a wrapper node.
8712 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
8713}
8714
8715// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
8716template <class NodeTy>
8717SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
8718 unsigned Flags) const {
8719 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
8720 SDLoc DL(N);
8721 EVT Ty = getPointerTy(DAG.getDataLayout());
8722 const unsigned char MO_NC = AArch64II::MO_NC;
8723 return DAG.getNode(
8725 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
8726 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
8727 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
8728 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
8729}
8730
8731// (addlow (adrp %hi(sym)) %lo(sym))
8732template <class NodeTy>
8733SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
8734 unsigned Flags) const {
8735 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
8736 SDLoc DL(N);
8737 EVT Ty = getPointerTy(DAG.getDataLayout());
8738 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
8739 SDValue Lo = getTargetNode(N, Ty, DAG,
8742 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
8743}
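// For example (illustrative): for a global "g" within ADRP range this lowers
// to the pair
//   adrp x0, g             ; MO_PAGE: 4KiB page containing g
//   add  x0, x0, :lo12:g   ; MO_PAGEOFF | MO_NC: offset within that page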
8744
8745// (adr sym)
8746template <class NodeTy>
8747SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
8748 unsigned Flags) const {
8749 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
8750 SDLoc DL(N);
8751 EVT Ty = getPointerTy(DAG.getDataLayout());
8752 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
8753 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
8754}
8755
8756SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
8757 SelectionDAG &DAG) const {
8758 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
8759 const GlobalValue *GV = GN->getGlobal();
8760 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
8761
8762 if (OpFlags != AArch64II::MO_NO_FLAG)
8763 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
8764 "unexpected offset in global node");
8765
8766 // This also catches the large code model case for Darwin, and tiny code
8767 // model with got relocations.
8768 if ((OpFlags & AArch64II::MO_GOT) != 0) {
8769 return getGOT(GN, DAG, OpFlags);
8770 }
8771
8775 Result = getAddrLarge(GN, DAG, OpFlags);
8776 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
8777 Result = getAddrTiny(GN, DAG, OpFlags);
8778 } else {
8779 Result = getAddr(GN, DAG, OpFlags);
8780 }
8781 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8782 SDLoc DL(GN);
8784 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
8786 return Result;
8787}
8788
8789/// Convert a TLS address reference into the correct sequence of loads
8790/// and calls to compute the variable's address (for Darwin, currently) and
8791/// return an SDValue containing the final node.
8792
8793/// Darwin only has one TLS scheme which must be capable of dealing with the
8794/// fully general situation, in the worst case. This means:
8795/// + "extern __thread" declaration.
8796/// + Defined in a possibly unknown dynamic library.
8797///
8798/// The general system is that each __thread variable has a [3 x i64] descriptor
8799/// which contains information used by the runtime to calculate the address. The
8800/// only part of this the compiler needs to know about is the first xword, which
8801/// contains a function pointer that must be called with the address of the
8802/// entire descriptor in "x0".
8803///
8804/// Since this descriptor may be in a different unit, in general even the
8805/// descriptor must be accessed via an indirect load. The "ideal" code sequence
8806/// is:
8807/// adrp x0, _var@TLVPPAGE
8808/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
8809/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
8810/// ; the function pointer
8811/// blr x1 ; Uses descriptor address in x0
8812/// ; Address of _var is now in x0.
8813///
8814/// If the address of _var's descriptor *is* known to the linker, then it can
8815/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
8816/// a slight efficiency gain.
8817SDValue
8818AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
8819 SelectionDAG &DAG) const {
8820 assert(Subtarget->isTargetDarwin() &&
8821 "This function expects a Darwin target");
8822
8823 SDLoc DL(Op);
8824 MVT PtrVT = getPointerTy(DAG.getDataLayout());
8825 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
8826 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
8827
8828 SDValue TLVPAddr =
8829 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
8830 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
8831
8832 // The first entry in the descriptor is a function pointer that we must call
8833 // to obtain the address of the variable.
8834 SDValue Chain = DAG.getEntryNode();
8835 SDValue FuncTLVGet = DAG.getLoad(
8836 PtrMemVT, DL, Chain, DescAddr,
8838 Align(PtrMemVT.getSizeInBits() / 8),
8840 Chain = FuncTLVGet.getValue(1);
8841
8842 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
8843 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
8844
8846 MFI.setAdjustsStack(true);
8847
8848 // TLS calls preserve all registers except those that absolutely must be
8849 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
8850 // silly).
8851 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8852 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
8853 if (Subtarget->hasCustomCallingConv())
8854 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
8855
8856 // Finally, we can make the call. This is just a degenerate version of a
8857 // normal AArch64 call node: x0 takes the address of the descriptor, and
8858 // returns the address of the variable in this thread.
8859 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
8860 Chain =
8861 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
8862 Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
8863 DAG.getRegisterMask(Mask), Chain.getValue(1));
8864 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
8865}
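// Illustrative sketch (not part of the original source): the runtime side of
// the Darwin access pattern described above, modelled in C++. The struct and
// field names are assumptions; only the first entry (the resolver function
// pointer called with the descriptor's own address) matters to the compiler.
namespace {
struct SketchTLVDescriptor {
  void *(*Resolver)(SketchTLVDescriptor *); // 1st xword, reached via "ldr x1, [x0]"
  uint64_t Opaque[2];                       // remaining entries are runtime-owned
};
inline void *sketchDarwinTLVAddress(SketchTLVDescriptor *Desc) {
  return Desc->Resolver(Desc); // corresponds to "blr x1" with x0 = Desc
}
} // namespace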
8866
8867/// Convert a thread-local variable reference into a sequence of instructions to
8868/// compute the variable's address for the local exec TLS model of ELF targets.
8869/// The sequence depends on the maximum TLS area size.
8870SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
8871 SDValue ThreadBase,
8872 const SDLoc &DL,
8873 SelectionDAG &DAG) const {
8874 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8875 SDValue TPOff, Addr;
8876
8877 switch (DAG.getTarget().Options.TLSSize) {
8878 default:
8879 llvm_unreachable("Unexpected TLS size");
8880
8881 case 12: {
8882 // mrs x0, TPIDR_EL0
8883 // add x0, x0, :tprel_lo12:a
8885 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
8886 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
8887 Var,
8888 DAG.getTargetConstant(0, DL, MVT::i32)),
8889 0);
8890 }
8891
8892 case 24: {
8893 // mrs x0, TPIDR_EL0
8894 // add x0, x0, :tprel_hi12:a
8895 // add x0, x0, :tprel_lo12_nc:a
8896 SDValue HiVar = DAG.getTargetGlobalAddress(
8897 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
8898 SDValue LoVar = DAG.getTargetGlobalAddress(
8899 GV, DL, PtrVT, 0,
8901 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
8902 HiVar,
8903 DAG.getTargetConstant(0, DL, MVT::i32)),
8904 0);
8905 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
8906 LoVar,
8907 DAG.getTargetConstant(0, DL, MVT::i32)),
8908 0);
8909 }
8910
8911 case 32: {
8912 // mrs x1, TPIDR_EL0
8913 // movz x0, #:tprel_g1:a
8914 // movk x0, #:tprel_g0_nc:a
8915 // add x0, x1, x0
8916 SDValue HiVar = DAG.getTargetGlobalAddress(
8917 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
8918 SDValue LoVar = DAG.getTargetGlobalAddress(
8919 GV, DL, PtrVT, 0,
8921 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
8922 DAG.getTargetConstant(16, DL, MVT::i32)),
8923 0);
8924 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
8925 DAG.getTargetConstant(0, DL, MVT::i32)),
8926 0);
8927 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
8928 }
8929
8930 case 48: {
8931 // mrs x1, TPIDR_EL0
8932 // movz x0, #:tprel_g2:a
8933 // movk x0, #:tprel_g1_nc:a
8934 // movk x0, #:tprel_g0_nc:a
8935 // add x0, x1, x0
8936 SDValue HiVar = DAG.getTargetGlobalAddress(
8937 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
8938 SDValue MiVar = DAG.getTargetGlobalAddress(
8939 GV, DL, PtrVT, 0,
8941 SDValue LoVar = DAG.getTargetGlobalAddress(
8942 GV, DL, PtrVT, 0,
8944 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
8945 DAG.getTargetConstant(32, DL, MVT::i32)),
8946 0);
8947 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
8948 DAG.getTargetConstant(16, DL, MVT::i32)),
8949 0);
8950 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
8951 DAG.getTargetConstant(0, DL, MVT::i32)),
8952 0);
8953 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
8954 }
8955 }
8956}
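// For example (illustrative): TLSSize == 24 reaches offsets up to 16MiB with
// the two-ADD form (:tprel_hi12: + :tprel_lo12_nc:), while TLSSize == 32
// builds offsets up to 4GiB with a MOVZ/MOVK pair before the final ADD.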
8957
8958/// When accessing thread-local variables under either the general-dynamic or
8959/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
8960/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
8961/// is a function pointer to carry out the resolution.
8962///
8963/// The sequence is:
8964/// adrp x0, :tlsdesc:var
8965/// ldr x1, [x0, #:tlsdesc_lo12:var]
8966/// add x0, x0, #:tlsdesc_lo12:var
8967/// .tlsdesccall var
8968/// blr x1
8969/// (TPIDR_EL0 offset now in x0)
8970///
8971/// The above sequence must be produced unscheduled, to enable the linker to
8972/// optimize/relax this sequence.
8973/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
8974/// above sequence, and expanded really late in the compilation flow, to ensure
8975/// the sequence is produced as per above.
8976SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
8977 const SDLoc &DL,
8978 SelectionDAG &DAG) const {
8979 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8980
8981 SDValue Chain = DAG.getEntryNode();
8982 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
8983
8984 Chain =
8985 DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
8986 SDValue Glue = Chain.getValue(1);
8987
8988 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
8989}
8990
8991SDValue
8992AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
8993 SelectionDAG &DAG) const {
8994 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
8995
8996 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
8997
8999
9001 if (Model == TLSModel::LocalDynamic)
9003 }
9004
9006 Model != TLSModel::LocalExec)
9007 report_fatal_error("ELF TLS only supported in small memory model or "
9008 "in local exec TLS model");
9009 // Different choices can be made for the maximum size of the TLS area for a
9010 // module. For the small address model, the default TLS size is 16MiB and the
9011 // maximum TLS size is 4GiB.
9012 // FIXME: add tiny and large code model support for TLS access models other
9013 // than local exec. We currently generate the same code as small for tiny,
9014 // which may be larger than needed.
9015
9016 SDValue TPOff;
9017 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9018 SDLoc DL(Op);
9019 const GlobalValue *GV = GA->getGlobal();
9020
9021 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
9022
9023 if (Model == TLSModel::LocalExec) {
9024 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
9025 } else if (Model == TLSModel::InitialExec) {
9026 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
9027 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
9028 } else if (Model == TLSModel::LocalDynamic) {
9029 // Local-dynamic accesses proceed in two phases. First, a general-dynamic TLS
9030 // descriptor call against the special symbol _TLS_MODULE_BASE_ calculates the
9031 // beginning of the module's TLS region; this is followed by a DTPREL offset
9032 // calculation.
9033
9034 // These accesses will need deduplicating if there's more than one.
9035 AArch64FunctionInfo *MFI =
9038
9039 // The call needs a relocation too for linker relaxation. It doesn't make
9040 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
9041 // the address.
9042 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
9043 AArch64II::MO_TLS);
9044
9045 // Now we can calculate the offset from TPIDR_EL0 to this module's
9046 // thread-local area.
9047 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
9048
9049 // Now use :dtprel_whatever: operations to calculate this variable's offset
9050 // in its thread-storage area.
9051 SDValue HiVar = DAG.getTargetGlobalAddress(
9052 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
9053 SDValue LoVar = DAG.getTargetGlobalAddress(
9054 GV, DL, MVT::i64, 0,
9055 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9056
9057 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
9058 DAG.getTargetConstant(0, DL, MVT::i32)),
9059 0);
9060 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
9061 DAG.getTargetConstant(0, DL, MVT::i32)),
9062 0);
9063 } else if (Model == TLSModel::GeneralDynamic) {
9064 // The call needs a relocation too for linker relaxation. It doesn't make
9065 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
9066 // the address.
9067 SDValue SymAddr =
9068 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
9069
9070 // Finally we can make a call to calculate the offset from tpidr_el0.
9071 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
9072 } else
9073 llvm_unreachable("Unsupported ELF TLS access model");
9074
9075 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
9076}
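// Illustrative aside (not part of this file): at run time every ELF TLS model
// handled above reduces to "thread pointer plus an offset"; only how TPOff is
// obtained differs (relocations, a GOT load, or a TLSDESC call). A hypothetical
// scalar model of the final ADD emitted above:
#include <cstdint>

static char *elfTLSAddressModel(char *ThreadPointer, uint64_t TPOff) {
  // LocalExec encodes TPOff directly, InitialExec loads it from the GOT, and
  // the dynamic models obtain it from the TLS descriptor resolver.
  return ThreadPointer + TPOff;
}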
9077
9078SDValue
9079AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
9080 SelectionDAG &DAG) const {
9081 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
9082
9083 SDValue Chain = DAG.getEntryNode();
9084 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9085 SDLoc DL(Op);
9086
9087 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
9088
9089 // Load the ThreadLocalStoragePointer from the TEB
9090 // A pointer to the TLS array is located at offset 0x58 from the TEB.
9091 SDValue TLSArray =
9092 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
9093 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
9094 Chain = TLSArray.getValue(1);
9095
9096 // Load the TLS index from the C runtime;
9097 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
9098 // This also does the same as LOADgot, but using a generic i32 load,
9099 // while LOADgot only loads i64.
9100 SDValue TLSIndexHi =
9101 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
9102 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
9103 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9104 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
9105 SDValue TLSIndex =
9106 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
9107 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
9108 Chain = TLSIndex.getValue(1);
9109
9110 // The pointer to the thread's TLS data area is at the TLS Index scaled by 8
9111 // offset into the TLSArray.
9112 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
9113 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
9114 DAG.getConstant(3, DL, PtrVT));
9115 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
9116 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
9117 MachinePointerInfo());
9118 Chain = TLS.getValue(1);
9119
9120 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9121 const GlobalValue *GV = GA->getGlobal();
9122 SDValue TGAHi = DAG.getTargetGlobalAddress(
9123 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
9124 SDValue TGALo = DAG.getTargetGlobalAddress(
9125 GV, DL, PtrVT, 0,
9126 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9127
9128 // Add the offset from the start of the .tls section (section base).
9129 SDValue Addr =
9130 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
9131 DAG.getTargetConstant(0, DL, MVT::i32)),
9132 0);
9133 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
9134 return Addr;
9135}
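// Illustrative aside (not part of this file): a C-level sketch of the address
// computation lowered above, assuming the documented TEB layout (the TLS array
// pointer at offset 0x58) and a hypothetical per-variable .tls section offset.
#include <cstdint>
#include <cstring>

static char *windowsTLSAddressModel(const char *TEB, uint32_t TlsIndex,
                                    uint64_t SectionOffset) {
  // ThreadLocalStoragePointer lives at TEB + 0x58.
  char **TLSArray;
  std::memcpy(&TLSArray, TEB + 0x58, sizeof(TLSArray));
  // Index the array by _tls_index; each slot is one pointer, hence the
  // shift-by-3 scaling in the DAG above.
  char *TLSBase = TLSArray[TlsIndex];
  // Finally add the variable's offset from the start of the .tls section.
  return TLSBase + SectionOffset;
}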
9136
9137SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
9138 SelectionDAG &DAG) const {
9139 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9140 if (DAG.getTarget().useEmulatedTLS())
9141 return LowerToTLSEmulatedModel(GA, DAG);
9142
9143 if (Subtarget->isTargetDarwin())
9144 return LowerDarwinGlobalTLSAddress(Op, DAG);
9145 if (Subtarget->isTargetELF())
9146 return LowerELFGlobalTLSAddress(Op, DAG);
9147 if (Subtarget->isTargetWindows())
9148 return LowerWindowsGlobalTLSAddress(Op, DAG);
9149
9150 llvm_unreachable("Unexpected platform trying to use TLS");
9151}
9152
9153// Looks through \param Val to determine the bit that can be used to
9154// check the sign of the value. It returns the unextended value and
9155// the sign bit position.
9156std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
9157 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
9158 return {Val.getOperand(0),
9159 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
9160 1};
9161
9162 if (Val.getOpcode() == ISD::SIGN_EXTEND)
9163 return {Val.getOperand(0),
9164 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
9165
9166 return {Val, Val.getValueSizeInBits() - 1};
9167}
9168
9169SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
9170 SDValue Chain = Op.getOperand(0);
9171 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
9172 SDValue LHS = Op.getOperand(2);
9173 SDValue RHS = Op.getOperand(3);
9174 SDValue Dest = Op.getOperand(4);
9175 SDLoc dl(Op);
9176
9177 MachineFunction &MF = DAG.getMachineFunction();
9178 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
9179 // will not be produced, as they are conditional branch instructions that do
9180 // not set flags.
9181 bool ProduceNonFlagSettingCondBr =
9182 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
9183
9184 // Handle f128 first, since lowering it will result in comparing the return
9185 // value of a libcall against zero, which is just what the rest of LowerBR_CC
9186 // is expecting to deal with.
9187 if (LHS.getValueType() == MVT::f128) {
9188 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
9189
9190 // If softenSetCCOperands returned a scalar, we need to compare the result
9191 // against zero to select between true and false values.
9192 if (!RHS.getNode()) {
9193 RHS = DAG.getConstant(0, dl, LHS.getValueType());
9194 CC = ISD::SETNE;
9195 }
9196 }
9197
9198 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
9199 // instruction.
9200 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
9201 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
9202 // Only lower legal XALUO ops.
9203 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
9204 return SDValue();
9205
9206 // The actual operation with overflow check.
9207 AArch64CC::CondCode OFCC;
9208 SDValue Value, Overflow;
9209 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
9210
9211 if (CC == ISD::SETNE)
9212 OFCC = getInvertedCondCode(OFCC);
9213 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
9214
9215 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
9216 Overflow);
9217 }
9218
9219 if (LHS.getValueType().isInteger()) {
9220 assert((LHS.getValueType() == RHS.getValueType()) &&
9221 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
9222
9223 // If the RHS of the comparison is zero, we can potentially fold this
9224 // to a specialized branch.
9225 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
9226 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
9227 if (CC == ISD::SETEQ) {
9228 // See if we can use a TBZ to fold in an AND as well.
9229 // TBZ has a smaller branch displacement than CBZ. If the offset is
9230 // out of bounds, a late MI-layer pass rewrites branches.
9231 // 403.gcc is an example that hits this case.
9232 if (LHS.getOpcode() == ISD::AND &&
9233 isa<ConstantSDNode>(LHS.getOperand(1)) &&
9234 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
9235 SDValue Test = LHS.getOperand(0);
9236 uint64_t Mask = LHS.getConstantOperandVal(1);
9237 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
9238 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
9239 Dest);
9240 }
9241
9242 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
9243 } else if (CC == ISD::SETNE) {
9244 // See if we can use a TBZ to fold in an AND as well.
9245 // TBZ has a smaller branch displacement than CBZ. If the offset is
9246 // out of bounds, a late MI-layer pass rewrites branches.
9247 // 403.gcc is an example that hits this case.
9248 if (LHS.getOpcode() == ISD::AND &&
9249 isa<ConstantSDNode>(LHS.getOperand(1)) &&
9250 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
9251 SDValue Test = LHS.getOperand(0);
9252 uint64_t Mask = LHS.getConstantOperandVal(1);
9253 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
9254 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
9255 Dest);
9256 }
9257
9258 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
9259 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
9260 // Don't combine AND since emitComparison converts the AND to an ANDS
9261 // (a.k.a. TST) and the test in the test bit and branch instruction
9262 // becomes redundant. This would also increase register pressure.
9263 uint64_t SignBitPos;
9264 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
9265 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
9266 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
9267 }
9268 }
9269 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
9270 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
9271 // Don't combine AND since emitComparison converts the AND to an ANDS
9272 // (a.k.a. TST) and the test in the test bit and branch instruction
9273 // becomes redundant. This would also increase register pressure.
9274 uint64_t SignBitPos;
9275 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
9276 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
9277 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
9278 }
9279
9280 SDValue CCVal;
9281 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
9282 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
9283 Cmp);
9284 }
9285
9286 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
9287 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
9288
9289 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
9290 // clean. Some of them require two branches to implement.
9291 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9292 AArch64CC::CondCode CC1, CC2;
9293 changeFPCCToAArch64CC(CC, CC1, CC2);
9294 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9295 SDValue BR1 =
9296 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
9297 if (CC2 != AArch64CC::AL) {
9298 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9299 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
9300 Cmp);
9301 }
9302
9303 return BR1;
9304}
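// Illustrative aside (not part of this file): the TB(N)Z rewrites above rely on
// "x < 0" (and "x > -1") being a test of the sign bit alone. A minimal scalar
// sketch of that equivalence for i32:
#include <cstdint>

static bool isNegativeViaSignBit(int32_t X) {
  // Branch on bit 31 only, which is what TBNZ does once lookThroughSignExtension
  // has located the sign-bit position of the unextended value.
  return (static_cast<uint32_t>(X) >> 31) & 1;
}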
9305
9306SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
9307 SelectionDAG &DAG) const {
9308 if (!Subtarget->hasNEON())
9309 return SDValue();
9310
9311 EVT VT = Op.getValueType();
9312 EVT IntVT = VT.changeTypeToInteger();
9313 SDLoc DL(Op);
9314
9315 SDValue In1 = Op.getOperand(0);
9316 SDValue In2 = Op.getOperand(1);
9317 EVT SrcVT = In2.getValueType();
9318
9319 if (!SrcVT.bitsEq(VT))
9320 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
9321
9322 if (VT.isScalableVector())
9323 IntVT =
9324 getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
9325
9326 if (VT.isFixedLengthVector() &&
9327 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
9328 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
9329
9330 In1 = convertToScalableVector(DAG, ContainerVT, In1);
9331 In2 = convertToScalableVector(DAG, ContainerVT, In2);
9332
9333 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
9334 return convertFromScalableVector(DAG, VT, Res);
9335 }
9336
9337 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
9338 if (VT.isScalableVector())
9339 return getSVESafeBitCast(VT, Op, DAG);
9340
9341 return DAG.getBitcast(VT, Op);
9342 };
9343
9344 SDValue VecVal1, VecVal2;
9345 EVT VecVT;
9346 auto SetVecVal = [&](int Idx = -1) {
9347 if (!VT.isVector()) {
9348 VecVal1 =
9349 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
9350 VecVal2 =
9351 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
9352 } else {
9353 VecVal1 = BitCast(VecVT, In1, DAG);
9354 VecVal2 = BitCast(VecVT, In2, DAG);
9355 }
9356 };
9357 if (VT.isVector()) {
9358 VecVT = IntVT;
9359 SetVecVal();
9360 } else if (VT == MVT::f64) {
9361 VecVT = MVT::v2i64;
9362 SetVecVal(AArch64::dsub);
9363 } else if (VT == MVT::f32) {
9364 VecVT = MVT::v4i32;
9365 SetVecVal(AArch64::ssub);
9366 } else if (VT == MVT::f16 || VT == MVT::bf16) {
9367 VecVT = MVT::v8i16;
9368 SetVecVal(AArch64::hsub);
9369 } else {
9370 llvm_unreachable("Invalid type for copysign!");
9371 }
9372
9373 unsigned BitWidth = In1.getScalarValueSizeInBits();
9374 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
9375
9376 // We want to materialize a mask with every bit but the high bit set, but the
9377 // AdvSIMD immediate moves cannot materialize that in a single instruction for
9378 // 64-bit elements. Instead, materialize all bits set and then negate that.
9379 if (VT == MVT::f64 || VT == MVT::v2f64) {
9380 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
9381 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
9382 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
9383 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
9384 }
9385
9386 SDValue BSP =
9387 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
9388 if (VT == MVT::f16 || VT == MVT::bf16)
9389 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
9390 if (VT == MVT::f32)
9391 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
9392 if (VT == MVT::f64)
9393 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
9394
9395 return BitCast(VT, BSP, DAG);
9396}
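// Illustrative aside (not part of this file): a scalar model of the bitwise
// select that the BSP node above performs for f64 copysign, assuming the
// IEEE-754 bit layout.
#include <cstdint>
#include <cstring>

static double copysignViaBitSelect(double Mag, double Sgn) {
  uint64_t M, S;
  std::memcpy(&M, &Mag, sizeof(M));
  std::memcpy(&S, &Sgn, sizeof(S));
  const uint64_t SignMask = 1ULL << 63;         // only the sign bit
  const uint64_t NotSign = ~SignMask;           // the mask materialized above
  uint64_t R = (M & NotSign) | (S & SignMask);  // BSP: magnitude bits from In1,
                                                // sign bit from In2
  double Res;
  std::memcpy(&Res, &R, sizeof(Res));
  return Res;
}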
9397
9398SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
9399 SelectionDAG &DAG) const {
9400 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
9401 Attribute::NoImplicitFloat))
9402 return SDValue();
9403
9404 if (!Subtarget->hasNEON())
9405 return SDValue();
9406
9407 bool IsParity = Op.getOpcode() == ISD::PARITY;
9408 SDValue Val = Op.getOperand(0);
9409 SDLoc DL(Op);
9410 EVT VT = Op.getValueType();
9411
9412 // For i32, a general parity computation using EORs is more efficient than
9413 // going through floating point.
9414 if (VT == MVT::i32 && IsParity)
9415 return SDValue();
9416
9417 // If there is no CNT instruction available, GPR popcount can
9418 // be more efficiently lowered to the following sequence that uses
9419 // AdvSIMD registers/instructions as long as the copies to/from
9420 // the AdvSIMD registers are cheap.
9421 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
9422 // CNT V0.8B, V0.8B // 8xbyte pop-counts
9423 // ADDV B0, V0.8B // sum 8xbyte pop-counts
9424 // UMOV X0, V0.B[0] // copy byte result back to integer reg
9425 if (VT == MVT::i32 || VT == MVT::i64) {
9426 if (VT == MVT::i32)
9427 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
9428 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
9429
9430 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
9431 SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
9432 UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
9433 DAG.getConstant(0, DL, MVT::i64));
9434
9435 if (IsParity)
9436 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
9437 DAG.getConstant(1, DL, MVT::i32));
9438
9439 if (VT == MVT::i64)
9440 UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
9441 return UaddLV;
9442 } else if (VT == MVT::i128) {
9443 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
9444
9445 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
9446 SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
9447 UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
9448 DAG.getConstant(0, DL, MVT::i64));
9449
9450 if (IsParity)
9451 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
9452 DAG.getConstant(1, DL, MVT::i32));
9453
9454 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
9455 }
9456
9457 assert(!IsParity && "ISD::PARITY of vector types not supported");
9458
9459 if (VT.isScalableVector() ||
9460 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
9461 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
9462
9463 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
9464 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
9465 "Unexpected type for custom ctpop lowering");
9466
9467 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
9468 Val = DAG.getBitcast(VT8Bit, Val);
9469 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
9470
9471 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
9472 unsigned EltSize = 8;
9473 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
9474 while (EltSize != VT.getScalarSizeInBits()) {
9475 EltSize *= 2;
9476 NumElts /= 2;
9477 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
9478 Val = DAG.getNode(
9479 ISD::INTRINSIC_WO_CHAIN, DL, WidenVT,
9480 DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
9481 }
9482
9483 return Val;
9484}
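// Illustrative aside (not part of this file): a scalar model of the CNT plus
// widening-pairwise-add scheme used above, here summing four byte pop-counts
// into an i32 result.
#include <cstdint>

static uint32_t popcount32ViaBytes(uint32_t X) {
  uint32_t Total = 0;
  for (int I = 0; I < 4; ++I) {
    uint8_t Byte = (X >> (8 * I)) & 0xff;
    uint32_t Cnt = 0;
    while (Byte) {        // per-byte pop-count, i.e. what CNT computes per lane
      Cnt += Byte & 1;
      Byte >>= 1;
    }
    Total += Cnt;         // the uaddlp/UADDLV steps sum these lane counts
  }
  return Total;           // parity, when requested, is simply Total & 1
}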
9485
9486SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
9487 EVT VT = Op.getValueType();
9488 assert(VT.isScalableVector() ||
9489 useSVEForFixedLengthVectorVT(
9490 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
9491
9492 SDLoc DL(Op);
9493 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
9494 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
9495}
9496
9497SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
9498 SelectionDAG &DAG) const {
9499
9500 EVT VT = Op.getValueType();
9501 SDLoc DL(Op);
9502 unsigned Opcode = Op.getOpcode();
9503 ISD::CondCode CC;
9504 switch (Opcode) {
9505 default:
9506 llvm_unreachable("Wrong instruction");
9507 case ISD::SMAX:
9508 CC = ISD::SETGT;
9509 break;
9510 case ISD::SMIN:
9511 CC = ISD::SETLT;
9512 break;
9513 case ISD::UMAX:
9514 CC = ISD::SETUGT;
9515 break;
9516 case ISD::UMIN:
9517 CC = ISD::SETULT;
9518 break;
9519 }
9520
9521 if (VT.isScalableVector() ||
9522 useSVEForFixedLengthVectorVT(
9523 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
9524 switch (Opcode) {
9525 default:
9526 llvm_unreachable("Wrong instruction");
9527 case ISD::SMAX:
9528 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
9529 case ISD::SMIN:
9530 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
9531 case ISD::UMAX:
9532 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
9533 case ISD::UMIN:
9534 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
9535 }
9536 }
9537
9538 SDValue Op0 = Op.getOperand(0);
9539 SDValue Op1 = Op.getOperand(1);
9540 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
9541 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
9542}
9543
9544SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
9545 SelectionDAG &DAG) const {
9546 EVT VT = Op.getValueType();
9547
9548 if (VT.isScalableVector() ||
9549 useSVEForFixedLengthVectorVT(
9550 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
9551 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
9552
9553 SDLoc DL(Op);
9554 SDValue REVB;
9555 MVT VST;
9556
9557 switch (VT.getSimpleVT().SimpleTy) {
9558 default:
9559 llvm_unreachable("Invalid type for bitreverse!");
9560
9561 case MVT::v2i32: {
9562 VST = MVT::v8i8;
9563 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
9564
9565 break;
9566 }
9567
9568 case MVT::v4i32: {
9569 VST = MVT::v16i8;
9570 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
9571
9572 break;
9573 }
9574
9575 case MVT::v1i64: {
9576 VST = MVT::v8i8;
9577 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
9578
9579 break;
9580 }
9581
9582 case MVT::v2i64: {
9583 VST = MVT::v16i8;
9584 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
9585
9586 break;
9587 }
9588 }
9589
9590 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
9591 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
9592}
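// Illustrative aside (not part of this file): the element-wise bitreverse above
// is byte reversal (REV32/REV64) followed by a per-byte bit reversal. A scalar
// i32 model of the same decomposition:
#include <cstdint>

static uint8_t reverseByte(uint8_t B) {
  uint8_t R = 0;
  for (int I = 0; I < 8; ++I)
    R = (R << 1) | ((B >> I) & 1);   // what the vector BITREVERSE does per byte
  return R;
}

static uint32_t bitreverse32(uint32_t X) {
  uint32_t R = 0;
  for (int I = 0; I < 4; ++I) {
    uint8_t Byte = (X >> (8 * I)) & 0xff;                  // take byte I
    R |= uint32_t(reverseByte(Byte)) << (8 * (3 - I));     // mirror its position
  }
  return R;
}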
9593
9594// Check whether N forms a continuous OR-of-XORs comparison chain.
9595static bool
9596isOrXorChain(SDValue N, unsigned &Num,
9597 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
9598 if (Num == MaxXors)
9599 return false;
9600
9601 // Skip the one-use zext
9602 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
9603 N = N->getOperand(0);
9604
9605 // The leaf node must be XOR
9606 if (N->getOpcode() == ISD::XOR) {
9607 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
9608 Num++;
9609 return true;
9610 }
9611
9612 // All the non-leaf nodes must be OR.
9613 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
9614 return false;
9615
9616 if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
9617 isOrXorChain(N->getOperand(1), Num, WorkList))
9618 return true;
9619 return false;
9620}
9621
9622// Transform chains of ORs and XORs, which are usually produced by memcmp/bcmp.
9623static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
9624 SDValue LHS = N->getOperand(0);
9625 SDValue RHS = N->getOperand(1);
9626 SDLoc DL(N);
9627 EVT VT = N->getValueType(0);
9628 SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
9629
9630 // Only handle integer compares.
9631 if (N->getOpcode() != ISD::SETCC)
9632 return SDValue();
9633
9634 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
9635 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
9636 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
9637 unsigned NumXors = 0;
9638 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
9639 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
9640 isOrXorChain(LHS, NumXors, WorkList)) {
9641 SDValue XOR0, XOR1;
9642 std::tie(XOR0, XOR1) = WorkList[0];
9643 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
9644 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
9645 for (unsigned I = 1; I < WorkList.size(); I++) {
9646 std::tie(XOR0, XOR1) = WorkList[I];
9647 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
9648 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
9649 }
9650
9651 // Exit early by inverting the condition, which helps reduce indentation.
9652 return Cmp;
9653 }
9654
9655 return SDValue();
9656}
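// Illustrative aside (not part of this file): the OR-of-XORs shape recognized
// above is the branchless equality test produced for small memcmp/bcmp calls,
// e.g. comparing two 16-byte buffers word by word.
#include <cstdint>

static bool equal16Bytes(const uint64_t A[2], const uint64_t B[2]) {
  // (A0 ^ B0) | (A1 ^ B1) == 0 iff both words match; the combine re-expresses
  // this as a compare followed by CCMP instead of materializing the OR chain.
  return ((A[0] ^ B[0]) | (A[1] ^ B[1])) == 0;
}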
9657
9658SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
9659
9660 if (Op.getValueType().isVector())
9661 return LowerVSETCC(Op, DAG);
9662
9663 bool IsStrict = Op->isStrictFPOpcode();
9664 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
9665 unsigned OpNo = IsStrict ? 1 : 0;
9666 SDValue Chain;
9667 if (IsStrict)
9668 Chain = Op.getOperand(0);
9669 SDValue LHS = Op.getOperand(OpNo + 0);
9670 SDValue RHS = Op.getOperand(OpNo + 1);
9671 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
9672 SDLoc dl(Op);
9673
9674 // We chose ZeroOrOneBooleanContents, so use zero and one.
9675 EVT VT = Op.getValueType();
9676 SDValue TVal = DAG.getConstant(1, dl, VT);
9677 SDValue FVal = DAG.getConstant(0, dl, VT);
9678
9679 // Handle f128 first, since one possible outcome is a normal integer
9680 // comparison which gets picked up by the next if statement.
9681 if (LHS.getValueType() == MVT::f128) {
9682 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
9683 IsSignaling);
9684
9685 // If softenSetCCOperands returned a scalar, use it.
9686 if (!RHS.getNode()) {
9687 assert(LHS.getValueType() == Op.getValueType() &&
9688 "Unexpected setcc expansion!");
9689 return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
9690 }
9691 }
9692
9693 if (LHS.getValueType().isInteger()) {
9694 SDValue CCVal;
9695 SDValue Cmp = getAArch64Cmp(
9696 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
9697
9698 // Note that we inverted the condition above, so we reverse the order of
9699 // the true and false operands here. This will allow the setcc to be
9700 // matched to a single CSINC instruction.
9701 SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
9702 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
9703 }
9704
9705 // Now we know we're dealing with FP values.
9706 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
9707 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
9708
9709 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
9710 // and do the comparison.
9711 SDValue Cmp;
9712 if (IsStrict)
9713 Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
9714 else
9715 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9716
9717 AArch64CC::CondCode CC1, CC2;
9718 changeFPCCToAArch64CC(CC, CC1, CC2);
9719 SDValue Res;
9720 if (CC2 == AArch64CC::AL) {
9721 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
9722 CC2);
9723 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9724
9725 // Note that we inverted the condition above, so we reverse the order of
9726 // the true and false operands here. This will allow the setcc to be
9727 // matched to a single CSINC instruction.
9728 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
9729 } else {
9730 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
9731 // totally clean. Some of them require two CSELs to implement. As is in
9732 // this case, we emit the first CSEL and then emit a second using the output
9733 // of the first as the RHS. We're effectively OR'ing the two CC's together.
9734
9735 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
9736 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9737 SDValue CS1 =
9738 DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
9739
9740 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9741 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
9742 }
9743 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
9744}
9745
9746SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
9747 SelectionDAG &DAG) const {
9748
9749 SDValue LHS = Op.getOperand(0);
9750 SDValue RHS = Op.getOperand(1);
9751 EVT VT = LHS.getValueType();
9752 if (VT != MVT::i32 && VT != MVT::i64)
9753 return SDValue();
9754
9755 SDLoc DL(Op);
9756 SDValue Carry = Op.getOperand(2);
9757 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
9758 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
9759 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue),
9760 LHS, RHS, InvCarry);
9761
9762 EVT OpVT = Op.getValueType();
9763 SDValue TVal = DAG.getConstant(1, DL, OpVT);
9764 SDValue FVal = DAG.getConstant(0, DL, OpVT);
9765
9766 ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
9767 ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
9768 SDValue CCVal =
9769 DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32);
9770 // Inputs are swapped because the condition is inverted. This will allow
9771 // matching with a single CSINC instruction.
9772 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
9773 Cmp.getValue(1));
9774}
9775
9776SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
9777 SDValue RHS, SDValue TVal,
9778 SDValue FVal, const SDLoc &dl,
9779 SelectionDAG &DAG) const {
9780 // Handle f128 first, because it will result in a comparison of some RTLIB
9781 // call result against zero.
9782 if (LHS.getValueType() == MVT::f128) {
9783 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
9784
9785 // If softenSetCCOperands returned a scalar, we need to compare the result
9786 // against zero to select between true and false values.
9787 if (!RHS.getNode()) {
9788 RHS = DAG.getConstant(0, dl, LHS.getValueType());
9789 CC = ISD::SETNE;
9790 }
9791 }
9792
9793 // Also handle f16, for which we need to do an f32 comparison.
9794 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
9795 LHS.getValueType() == MVT::bf16) {
9796 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
9797 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
9798 }
9799
9800 // Next, handle integers.
9801 if (LHS.getValueType().isInteger()) {
9802 assert((LHS.getValueType() == RHS.getValueType()) &&
9803 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
9804
9805 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
9806 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
9807 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
9808 // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
9809 // into (OR (ASR lhs, N-1), 1), which requires less instructions for the
9810 // supported types.
9811 if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
9812 CTVal->isOne() && CFVal->isAllOnes() &&
9813 LHS.getValueType() == TVal.getValueType()) {
9814 EVT VT = LHS.getValueType();
9815 SDValue Shift =
9816 DAG.getNode(ISD::SRA, dl, VT, LHS,
9817 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
9818 return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
9819 }
9820
9821 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
9822 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
9823 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
9824 // Both require less instructions than compare and conditional select.
9825 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
9826 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
9827 LHS.getValueType() == RHS.getValueType()) {
9828 EVT VT = LHS.getValueType();
9829 SDValue Shift =
9830 DAG.getNode(ISD::SRA, dl, VT, LHS,
9831 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
9832
9833 if (CC == ISD::SETGT)
9834 Shift = DAG.getNOT(dl, Shift, VT);
9835
9836 return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
9837 }
9838
9839 unsigned Opcode = AArch64ISD::CSEL;
9840
9841 // If both the TVal and the FVal are constants, see if we can swap them in
9842 // order to form a CSINV or CSINC out of them.
9843 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
9844 std::swap(TVal, FVal);
9845 std::swap(CTVal, CFVal);
9846 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9847 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
9848 std::swap(TVal, FVal);
9849 std::swap(CTVal, CFVal);
9850 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9851 } else if (TVal.getOpcode() == ISD::XOR) {
9852 // If TVal is a NOT we want to swap TVal and FVal so that we can match
9853 // with a CSINV rather than a CSEL.
9854 if (isAllOnesConstant(TVal.getOperand(1))) {
9855 std::swap(TVal, FVal);
9856 std::swap(CTVal, CFVal);
9857 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9858 }
9859 } else if (TVal.getOpcode() == ISD::SUB) {
9860 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
9861 // that we can match with a CSNEG rather than a CSEL.
9862 if (isNullConstant(TVal.getOperand(0))) {
9863 std::swap(TVal, FVal);
9864 std::swap(CTVal, CFVal);
9865 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9866 }
9867 } else if (CTVal && CFVal) {
9868 const int64_t TrueVal = CTVal->getSExtValue();
9869 const int64_t FalseVal = CFVal->getSExtValue();
9870 bool Swap = false;
9871
9872 // If both TVal and FVal are constants, see if FVal is the
9873 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
9874 // instead of a CSEL in that case.
9875 if (TrueVal == ~FalseVal) {
9876 Opcode = AArch64ISD::CSINV;
9877 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
9878 TrueVal == -FalseVal) {
9879 Opcode = AArch64ISD::CSNEG;
9880 } else if (TVal.getValueType() == MVT::i32) {
9881 // If our operands are only 32-bit wide, make sure we use 32-bit
9882 // arithmetic for the check whether we can use CSINC. This ensures that
9883 // the addition in the check will wrap around properly in case there is
9884 // an overflow (which would not be the case if we do the check with
9885 // 64-bit arithmetic).
9886 const uint32_t TrueVal32 = CTVal->getZExtValue();
9887 const uint32_t FalseVal32 = CFVal->getZExtValue();
9888
9889 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
9890 Opcode = AArch64ISD::CSINC;
9891
9892 if (TrueVal32 > FalseVal32) {
9893 Swap = true;
9894 }
9895 }
9896 } else {
9897 // 64-bit check whether we can use CSINC.
9898 const uint64_t TrueVal64 = TrueVal;
9899 const uint64_t FalseVal64 = FalseVal;
9900
9901 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
9902 Opcode = AArch64ISD::CSINC;
9903
9904 if (TrueVal > FalseVal) {
9905 Swap = true;
9906 }
9907 }
9908 }
9909
9910 // Swap TVal and FVal if necessary.
9911 if (Swap) {
9912 std::swap(TVal, FVal);
9913 std::swap(CTVal, CFVal);
9914 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9915 }
9916
9917 if (Opcode != AArch64ISD::CSEL) {
9918 // Drop FVal since we can get its value by simply inverting/negating
9919 // TVal.
9920 FVal = TVal;
9921 }
9922 }
9923
9924 // Avoid materializing a constant when possible by reusing a known value in
9925 // a register. However, don't perform this optimization if the known value
9926 // is one, zero or negative one in the case of a CSEL. We can always
9927 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
9928 // FVal, respectively.
9929 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
9930 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
9931 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
9932 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
9933 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
9934 // "a != C ? x : a" to avoid materializing C.
9935 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
9936 TVal = LHS;
9937 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
9938 FVal = LHS;
9939 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
9940 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
9941 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
9942 // avoid materializing C.
9943 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
9944 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
9945 Opcode = AArch64ISD::CSINV;
9946 TVal = LHS;
9947 FVal = DAG.getConstant(0, dl, FVal.getValueType());
9948 }
9949 }
9950
9951 SDValue CCVal;
9952 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
9953 EVT VT = TVal.getValueType();
9954 return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
9955 }
9956
9957 // Now we know we're dealing with FP values.
9958 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
9959 LHS.getValueType() == MVT::f64);
9960 assert(LHS.getValueType() == RHS.getValueType());
9961 EVT VT = TVal.getValueType();
9962 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9963
9964 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
9965 // clean. Some of them require two CSELs to implement.
9966 AArch64CC::CondCode CC1, CC2;
9967 changeFPCCToAArch64CC(CC, CC1, CC2);
9968
9969 if (DAG.getTarget().Options.UnsafeFPMath) {
9970 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
9971 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
9972 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
9973 if (RHSVal && RHSVal->isZero()) {
9974 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
9975 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
9976
9977 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
9978 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
9979 TVal = LHS;
9980 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
9981 CFVal && CFVal->isZero() &&
9982 FVal.getValueType() == LHS.getValueType())
9983 FVal = LHS;
9984 }
9985 }
9986
9987 // Emit first, and possibly only, CSEL.
9988 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9989 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
9990
9991 // If we need a second CSEL, emit it, using the output of the first as the
9992 // RHS. We're effectively OR'ing the two CC's together.
9993 if (CC2 != AArch64CC::AL) {
9994 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9995 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
9996 }
9997
9998 // Otherwise, return the output of the first CSEL.
9999 return CS1;
10000}
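// Illustrative aside (not part of this file): a scalar sketch of the
// "(x > -1) ? 1 : -1" to "(x >> 31) | 1" rewrite performed above for i32,
// assuming an arithmetic right shift of the signed value (as on AArch64).
#include <cstdint>

static int32_t selectSignViaShift(int32_t X) {
  // ASR replicates the sign bit: 0 for non-negative X, all ones for negative X.
  // OR-ing in 1 then yields +1 or -1 without a compare or conditional select.
  return (X >> 31) | 1;
}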
10001
10002SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
10003 SelectionDAG &DAG) const {
10004 EVT Ty = Op.getValueType();
10005 auto Idx = Op.getConstantOperandAPInt(2);
10006 int64_t IdxVal = Idx.getSExtValue();
10007 assert(Ty.isScalableVector() &&
10008 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
10009
10010 // We can use the splice instruction for certain index values where we are
10011 // able to efficiently generate the correct predicate. The index will be
10012 // inverted and used directly as the input to the ptrue instruction, i.e.
10013 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
10014 // splice predicate. However, we can only do this if we can guarantee that
10015 // there are enough elements in the vector, hence we check the index <= min
10016 // number of elements.
10017 std::optional<unsigned> PredPattern;
10018 if (Ty.isScalableVector() && IdxVal < 0 &&
10019 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
10020 std::nullopt) {
10021 SDLoc DL(Op);
10022
10023 // Create a predicate where all but the last -IdxVal elements are false.
10024 EVT PredVT = Ty.changeVectorElementType(MVT::i1);
10025 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
10026 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
10027
10028 // Now splice the two inputs together using the predicate.
10029 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
10030 Op.getOperand(1));
10031 }
10032
10033 // This will select to an EXT instruction, which has a maximum immediate
10034 // value of 255, hence 2048-bits is the maximum value we can lower.
10035 if (IdxVal >= 0 &&
10036 IdxVal < int64_t(2048 / Ty.getVectorElementType().getSizeInBits()))
10037 return Op;
10038
10039 return SDValue();
10040}
10041
10042SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
10043 SelectionDAG &DAG) const {
10044 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
10045 SDValue LHS = Op.getOperand(0);
10046 SDValue RHS = Op.getOperand(1);
10047 SDValue TVal = Op.getOperand(2);
10048 SDValue FVal = Op.getOperand(3);
10049 SDLoc DL(Op);
10050 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
10051}
10052
10053SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
10054 SelectionDAG &DAG) const {
10055 SDValue CCVal = Op->getOperand(0);
10056 SDValue TVal = Op->getOperand(1);
10057 SDValue FVal = Op->getOperand(2);
10058 SDLoc DL(Op);
10059
10060 EVT Ty = Op.getValueType();
10061 if (Ty == MVT::aarch64svcount) {
10062 TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
10063 FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
10064 SDValue Sel =
10065 DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
10066 return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
10067 }
10068
10069 if (Ty.isScalableVector()) {
10070 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
10071 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
10072 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
10073 }
10074
10075 if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
10076 // FIXME: Ideally this would be the same as above using i1 types, however
10077 // for the moment we can't deal with fixed i1 vector types properly, so
10078 // instead extend the predicate to a result type sized integer vector.
10079 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
10080 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
10081 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
10082 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
10083 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
10084 }
10085
10086 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
10087 // instruction.
10088 if (ISD::isOverflowIntrOpRes(CCVal)) {
10089 // Only lower legal XALUO ops.
10090 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
10091 return SDValue();
10092
10093 AArch64CC::CondCode OFCC;
10094 SDValue Value, Overflow;
10095 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
10096 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
10097
10098 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
10099 CCVal, Overflow);
10100 }
10101
10102 // Lower it the same way as we would lower a SELECT_CC node.
10103 ISD::CondCode CC;
10104 SDValue LHS, RHS;
10105 if (CCVal.getOpcode() == ISD::SETCC) {
10106 LHS = CCVal.getOperand(0);
10107 RHS = CCVal.getOperand(1);
10108 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
10109 } else {
10110 LHS = CCVal;
10111 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
10112 CC = ISD::SETNE;
10113 }
10114
10115 // If we are lowering an f16 and we do not have full fp16 support, convert it
10116 // to an f32 in order to use FCSELSrrr.
10117 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
10118 TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
10119 DAG.getUNDEF(MVT::f32), TVal);
10120 FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
10121 DAG.getUNDEF(MVT::f32), FVal);
10122 }
10123
10124 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
10125
10126 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
10127 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
10128 }
10129
10130 return Res;
10131}
10132
10133SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
10134 SelectionDAG &DAG) const {
10135 // Jump table entries as PC relative offsets. No additional tweaking
10136 // is necessary here. Just get the address of the jump table.
10137 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
10138
10139 CodeModel::Model CM = getTargetMachine().getCodeModel();
10140 if (CM == CodeModel::Large && !getTargetMachine().isPositionIndependent() &&
10141 !Subtarget->isTargetMachO())
10142 return getAddrLarge(JT, DAG);
10143 if (CM == CodeModel::Tiny)
10144 return getAddrTiny(JT, DAG);
10145 return getAddr(JT, DAG);
10146}
10147
10148SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
10149 SelectionDAG &DAG) const {
10150 // Jump table entries as PC relative offsets. No additional tweaking
10151 // is necessary here. Just get the address of the jump table.
10152 SDLoc DL(Op);
10153 SDValue JT = Op.getOperand(1);
10154 SDValue Entry = Op.getOperand(2);
10155 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
10156
10157 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10158 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
10159
10160 SDNode *Dest =
10161 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
10162 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
10163 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
10164 return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
10165}
10166
10167SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
10168 SelectionDAG &DAG) const {
10169 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
10170 CodeModel::Model CM = getTargetMachine().getCodeModel();
10171 if (CM == CodeModel::Large) {
10172 // Use the GOT for the large code model on iOS.
10173 if (Subtarget->isTargetMachO()) {
10174 return getGOT(CP, DAG);
10175 }
10176 if (!getTargetMachine().isPositionIndependent())
10177 return getAddrLarge(CP, DAG);
10178 } else if (CM == CodeModel::Tiny) {
10179 return getAddrTiny(CP, DAG);
10180 }
10181 return getAddr(CP, DAG);
10182}
10183
10184SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
10185 SelectionDAG &DAG) const {
10186 BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
10187 CodeModel::Model CM = getTargetMachine().getCodeModel();
10188 if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
10189 if (!getTargetMachine().isPositionIndependent())
10190 return getAddrLarge(BA, DAG);
10191 } else if (CM == CodeModel::Tiny) {
10192 return getAddrTiny(BA, DAG);
10193 }
10194 return getAddr(BA, DAG);
10195}
10196
10197SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
10198 SelectionDAG &DAG) const {
10199 AArch64FunctionInfo *FuncInfo =
10200 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10201
10202 SDLoc DL(Op);
10203 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
10204 getPointerTy(DAG.getDataLayout()));
10205 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
10206 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10207 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
10208 MachinePointerInfo(SV));
10209}
10210
10211SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
10212 SelectionDAG &DAG) const {
10213 MachineFunction &MF = DAG.getMachineFunction();
10214 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10215
10216 SDLoc DL(Op);
10217 SDValue FR;
10218 if (Subtarget->isWindowsArm64EC()) {
10219 // With the Arm64EC ABI, we compute the address of the varargs save area
10220 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
10221 // but calls from an entry thunk can pass in a different address.
10222 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
10223 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
10224 uint64_t StackOffset;
10225 if (FuncInfo->getVarArgsGPRSize() > 0)
10226 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
10227 else
10228 StackOffset = FuncInfo->getVarArgsStackOffset();
10229 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
10230 DAG.getConstant(StackOffset, DL, MVT::i64));
10231 } else {
10232 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
10233 ? FuncInfo->getVarArgsGPRIndex()
10234 : FuncInfo->getVarArgsStackIndex(),
10235 getPointerTy(DAG.getDataLayout()));
10236 }
10237 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10238 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
10239 MachinePointerInfo(SV));
10240}
10241
10242SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
10243 SelectionDAG &DAG) const {
10244 // The layout of the va_list struct is specified in the AArch64 Procedure Call
10245 // Standard, section B.3.
10246 MachineFunction &MF = DAG.getMachineFunction();
10247 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10248 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
10249 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10250 auto PtrVT = getPointerTy(DAG.getDataLayout());
10251 SDLoc DL(Op);
10252
10253 SDValue Chain = Op.getOperand(0);
10254 SDValue VAList = Op.getOperand(1);
10255 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10256 SmallVector<SDValue, 4> MemOps;
10257
10258 // void *__stack at offset 0
10259 unsigned Offset = 0;
10260 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
10261 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
10262 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
10263 MachinePointerInfo(SV), Align(PtrSize)));
10264
10265 // void *__gr_top at offset 8 (4 on ILP32)
10266 Offset += PtrSize;
10267 int GPRSize = FuncInfo->getVarArgsGPRSize();
10268 if (GPRSize > 0) {
10269 SDValue GRTop, GRTopAddr;
10270
10271 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10272 DAG.getConstant(Offset, DL, PtrVT));
10273
10274 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
10275 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
10276 DAG.getConstant(GPRSize, DL, PtrVT));
10277 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
10278
10279 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
10280 MachinePointerInfo(SV, Offset),
10281 Align(PtrSize)));
10282 }
10283
10284 // void *__vr_top at offset 16 (8 on ILP32)
10285 Offset += PtrSize;
10286 int FPRSize = FuncInfo->getVarArgsFPRSize();
10287 if (FPRSize > 0) {
10288 SDValue VRTop, VRTopAddr;
10289 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10290 DAG.getConstant(Offset, DL, PtrVT));
10291
10292 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
10293 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
10294 DAG.getConstant(FPRSize, DL, PtrVT));
10295 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
10296
10297 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
10298 MachinePointerInfo(SV, Offset),
10299 Align(PtrSize)));
10300 }
10301
10302 // int __gr_offs at offset 24 (12 on ILP32)
10303 Offset += PtrSize;
10304 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10305 DAG.getConstant(Offset, DL, PtrVT));
10306 MemOps.push_back(
10307 DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32),
10308 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
10309
10310 // int __vr_offs at offset 28 (16 on ILP32)
10311 Offset += 4;
10312 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10313 DAG.getConstant(Offset, DL, PtrVT));
10314 MemOps.push_back(
10315 DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32),
10316 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
10317
10318 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
10319}
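// Illustrative aside (not part of this file): the va_list object being
// initialized above, laid out as in AAPCS64 section B.3 (LP64 offsets shown;
// ILP32 uses 4-byte pointers, matching the alternate offsets in the comments).
struct AAPCS64VaListModel {
  void *__stack;   // next stacked argument                   (offset 0)
  void *__gr_top;  // end of the GP register save area        (offset 8)
  void *__vr_top;  // end of the FP/SIMD register save area   (offset 16)
  int __gr_offs;   // negative bytes remaining in the GP area (offset 24)
  int __vr_offs;   // negative bytes remaining in the FP area (offset 28)
};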
10320
10321SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
10322 SelectionDAG &DAG) const {
10323 MachineFunction &MF = DAG.getMachineFunction();
10324
10325 if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()))
10326 return LowerWin64_VASTART(Op, DAG);
10327 else if (Subtarget->isTargetDarwin())
10328 return LowerDarwin_VASTART(Op, DAG);
10329 else
10330 return LowerAAPCS_VASTART(Op, DAG);
10331}
10332
10333SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
10334 SelectionDAG &DAG) const {
10335 // AAPCS has three pointers and two ints (= 32 bytes), Darwin has a single
10336 // pointer.
10337 SDLoc DL(Op);
10338 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
10339 unsigned VaListSize =
10340 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
10341 ? PtrSize
10342 : Subtarget->isTargetILP32() ? 20 : 32;
10343 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
10344 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
10345
10346 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
10347 DAG.getConstant(VaListSize, DL, MVT::i32),
10348 Align(PtrSize), false, false, false,
10349 MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
10350}
10351
10352SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
10353 assert(Subtarget->isTargetDarwin() &&
10354 "automatic va_arg instruction only works on Darwin");
10355
10356 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10357 EVT VT = Op.getValueType();
10358 SDLoc DL(Op);
10359 SDValue Chain = Op.getOperand(0);
10360 SDValue Addr = Op.getOperand(1);
10361 MaybeAlign Align(Op.getConstantOperandVal(3));
10362 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
10363 auto PtrVT = getPointerTy(DAG.getDataLayout());
10364 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10365 SDValue VAList =
10366 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
10367 Chain = VAList.getValue(1);
10368 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
10369
10370 if (VT.isScalableVector())
10371 report_fatal_error("Passing SVE types to variadic functions is "
10372 "currently not supported");
10373
10374 if (Align && *Align > MinSlotSize) {
10375 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10376 DAG.getConstant(Align->value() - 1, DL, PtrVT));
10377 VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
10378 DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
10379 }
10380
10381 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
10382 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
10383
10384 // Scalar integer and FP values smaller than 64 bits are implicitly extended
10385 // up to 64 bits. At the very least, we have to increase the striding of the
10386 // vaargs list to match this, and for FP values we need to introduce
10387 // FP_ROUND nodes as well.
10388 if (VT.isInteger() && !VT.isVector())
10389 ArgSize = std::max(ArgSize, MinSlotSize);
10390 bool NeedFPTrunc = false;
10391 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
10392 ArgSize = 8;
10393 NeedFPTrunc = true;
10394 }
10395
10396 // Increment the pointer, VAList, to the next vaarg
10397 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10398 DAG.getConstant(ArgSize, DL, PtrVT));
10399 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
10400
10401 // Store the incremented VAList to the legalized pointer
10402 SDValue APStore =
10403 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
10404
10405 // Load the actual argument out of the pointer VAList
10406 if (NeedFPTrunc) {
10407 // Load the value as an f64.
10408 SDValue WideFP =
10409 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
10410 // Round the value down to an f32.
10411 SDValue NarrowFP =
10412 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
10413 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
10414 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
10415 // Merge the rounded value with the chain output of the load.
10416 return DAG.getMergeValues(Ops, DL);
10417 }
10418
10419 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
10420}
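// Illustrative aside (not part of this file): the over-aligned va_arg slot
// adjustment above is the usual round-up-to-alignment idiom; a scalar model,
// assuming a power-of-two alignment as the ADD/AND pair emitted above does.
#include <cstdint>

static uint64_t alignUpModel(uint64_t Ptr, uint64_t Align) {
  // (Ptr + Align - 1) & -Align, written with the equivalent ~(Align - 1) mask.
  return (Ptr + Align - 1) & ~(Align - 1);
}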
10421
10422SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
10423 SelectionDAG &DAG) const {
10424 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10425 MFI.setFrameAddressIsTaken(true);
10426
10427 EVT VT = Op.getValueType();
10428 SDLoc DL(Op);
10429 unsigned Depth = Op.getConstantOperandVal(0);
10430 SDValue FrameAddr =
10431 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
10432 while (Depth--)
10433 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
10434 MachinePointerInfo());
10435
10436 if (Subtarget->isTargetILP32())
10437 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
10438 DAG.getValueType(VT));
10439
10440 return FrameAddr;
10441}
10442
10443SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
10444 SelectionDAG &DAG) const {
10445 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10446
10447 EVT VT = getPointerTy(DAG.getDataLayout());
10448 SDLoc DL(Op);
10449 int FI = MFI.CreateFixedObject(4, 0, false);
10450 return DAG.getFrameIndex(FI, VT);
10451}
10452
10453#define GET_REGISTER_MATCHER
10454#include "AArch64GenAsmMatcher.inc"
10455
10456// FIXME? Maybe this could be a TableGen attribute on some registers and
10457// this table could be generated automatically from RegInfo.
10458Register AArch64TargetLowering::
10459getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
10460 Register Reg = MatchRegisterName(RegName);
10461 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
10462 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
10463 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
10464 if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
10465 !MRI->isReservedReg(MF, Reg))
10466 Reg = 0;
10467 }
10468 if (Reg)
10469 return Reg;
10470 report_fatal_error(Twine("Invalid register name \""
10471 + StringRef(RegName) + "\"."));
10472}
10473
10474SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
10475 SelectionDAG &DAG) const {
10476 DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
10477
10478 EVT VT = Op.getValueType();
10479 SDLoc DL(Op);
10480
10481 SDValue FrameAddr =
10482 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
10483 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
10484
10485 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
10486}
10487
10488SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
10489 SelectionDAG &DAG) const {
10490 MachineFunction &MF = DAG.getMachineFunction();
10491 MachineFrameInfo &MFI = MF.getFrameInfo();
10492 MFI.setReturnAddressIsTaken(true);
10493
10494 EVT VT = Op.getValueType();
10495 SDLoc DL(Op);
10496 unsigned Depth = Op.getConstantOperandVal(0);
10497 SDValue ReturnAddress;
10498 if (Depth) {
10499 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
10500 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
10501 ReturnAddress = DAG.getLoad(
10502 VT, DL, DAG.getEntryNode(),
10503 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
10504 } else {
10505 // Return LR, which contains the return address. Mark it an implicit
10506 // live-in.
10507 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
10508 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
10509 }
10510
10511 // The XPACLRI instruction assembles to a hint-space instruction before
10512 // Armv8.3-A, therefore it can be safely used on any pre-Armv8.3-A
10513 // architecture. On Armv8.3-A and onwards XPACI is available, so use
10514 // that instead.
10515 SDNode *St;
10516 if (Subtarget->hasPAuth()) {
10517 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
10518 } else {
10519 // XPACLRI operates on LR therefore we must move the operand accordingly.
10520 SDValue Chain =
10521 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
10522 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
10523 }
10524 return SDValue(St, 0);
10525}
10526
10527 /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
10528 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
10529SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
10530 SelectionDAG &DAG) const {
10531 SDValue Lo, Hi;
10532 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
10533 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
10534}
10535
10537 const GlobalAddressSDNode *GA) const {
10538 // Offsets are folded in the DAG combine rather than here so that we can
10539 // intelligently choose an offset based on the uses.
10540 return false;
10541}
10542
10543 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
10544 bool OptForSize) const {
10545 bool IsLegal = false;
10546 // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
10547 // and for the 16-bit case when the target has full fp16 support.
10548 // We encode bf16 bit patterns as if they were fp16. This results in very
10549 // strange looking assembly but should populate the register with appropriate
10550 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
10551 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
10552 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
10553 // FIXME: We should be able to handle f128 as well with a clever lowering.
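// For reference (illustrative): the 8-bit fmov immediate encodes values of
// the form +/- (16..31)/16 * 2^e with e in [-3, 4], so constants such as
// 0.5, 1.0, 3.0 or 31.0 are legal here, while e.g. 0.1 is not and falls
// through to the integer-expansion check below.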
10554 const APInt ImmInt = Imm.bitcastToAPInt();
10555 if (VT == MVT::f64)
10556 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
10557 else if (VT == MVT::f32)
10558 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
10559 else if (VT == MVT::f16 || VT == MVT::bf16)
10560 IsLegal =
10561 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
10562 Imm.isPosZero();
10563
10564 // If we cannot materialize the value in the fmov immediate field, check if the
10565 // value can be encoded as the immediate operand of a logical instruction.
10566 // The immediate value will be created with either MOVZ, MOVN, or ORR.
10567 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
10568 // generate that fmov.
10569 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
10570 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
10571 // however the mov+fmov sequence is always better because of the reduced
10572 // cache pressure. The timings are still the same if you consider
10573 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
10574 // movw+movk is fused). So we limit the expansion to 2 instructions at most.
10575 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
10576 AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn);
10577 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
10578 IsLegal = Insn.size() <= Limit;
10579 }
10580
10581 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
10582 << " imm value: "; Imm.dump(););
10583 return IsLegal;
10584}
10585
10586//===----------------------------------------------------------------------===//
10587// AArch64 Optimization Hooks
10588//===----------------------------------------------------------------------===//
10589
10590static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
10591 SDValue Operand, SelectionDAG &DAG,
10592 int &ExtraSteps) {
10593 EVT VT = Operand.getValueType();
10594 if ((ST->hasNEON() &&
10595 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
10596 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
10597 VT == MVT::v4f32)) ||
10598 (ST->hasSVE() &&
10599 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
10600 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) {
10601 // For the reciprocal estimates, convergence is quadratic, so the number
10602 // of digits is doubled after each iteration. In ARMv8, the accuracy of
10603 // the initial estimate is 2^-8. Thus the number of extra steps to refine
10604 // the result for float (23 mantissa bits) is 2 and for double (52
10605 // mantissa bits) is 3.
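// Worked example (assuming DesiredBits is the APFloat precision, i.e. 24 for
// float and 53 for double): ExtraSteps = Log2_64_Ceil(24) - Log2_64_Ceil(8)
// = 5 - 3 = 2 for float, and Log2_64_Ceil(53) - 3 = 6 - 3 = 3 for double,
// matching the counts quoted above.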
10606 constexpr unsigned AccurateBits = 8;
10607 unsigned DesiredBits =
10608 APFloat::semanticsPrecision(DAG.EVTToAPFloatSemantics(VT));
10609 ExtraSteps = DesiredBits <= AccurateBits
10610 ? 0
10611 : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
10612 }
10613
10614 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
10615 }
10616
10617 return SDValue();
10618}
10619
10620SDValue
10621AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
10622 const DenormalMode &Mode) const {
10623 SDLoc DL(Op);
10624 EVT VT = Op.getValueType();
10625 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
10626 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
10627 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
10628}
10629
10630SDValue
10631AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
10632 SelectionDAG &DAG) const {
10633 return Op;
10634}
10635
10636SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
10637 SelectionDAG &DAG, int Enabled,
10638 int &ExtraSteps,
10639 bool &UseOneConst,
10640 bool Reciprocal) const {
10641 if (Enabled == ReciprocalEstimate::Enabled ||
10642 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
10643 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
10644 DAG, ExtraSteps)) {
10645 SDLoc DL(Operand);
10646 EVT VT = Operand.getValueType();
10647
10648 SDNodeFlags Flags;
10649 Flags.setAllowReassociation(true);
10650
10651 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
10652 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
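// Put differently (illustrative): since FRSQRTS(a, b) computes (3 - a*b)/2,
// each pass of the loop below forms Step = FRSQRTS(X, E*E) and then
// Estimate = E * Step, which is exactly E * 0.5 * (3 - X * E^2).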
10653 for (int i = ExtraSteps; i > 0; --i) {
10654 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
10655 Flags);
10656 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
10657 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
10658 }
10659 if (!Reciprocal)
10660 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
10661
10662 ExtraSteps = 0;
10663 return Estimate;
10664 }
10665
10666 return SDValue();
10667}
10668
10669SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
10670 SelectionDAG &DAG, int Enabled,
10671 int &ExtraSteps) const {
10672 if (Enabled == ReciprocalEstimate::Enabled)
10673 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
10674 DAG, ExtraSteps)) {
10675 SDLoc DL(Operand);
10676 EVT VT = Operand.getValueType();
10677
10678 SDNodeFlags Flags;
10679 Flags.setAllowReassociation(true);
10680
10681 // Newton reciprocal iteration: E * (2 - X * E)
10682 // AArch64 reciprocal iteration instruction: (2 - M * N)
10683 for (int i = ExtraSteps; i > 0; --i) {
10684 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
10685 Estimate, Flags);
10686 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
10687 }
10688
10689 ExtraSteps = 0;
10690 return Estimate;
10691 }
10692
10693 return SDValue();
10694}
10695
10696//===----------------------------------------------------------------------===//
10697// AArch64 Inline Assembly Support
10698//===----------------------------------------------------------------------===//
10699
10700// Table of Constraints
10701 // TODO: This is the current set of constraints supported by ARM for the
10702 // compiler; not all of them may make sense.
10703//
10704// r - A general register
10705// w - An FP/SIMD register of some size in the range v0-v31
10706// x - An FP/SIMD register of some size in the range v0-v15
10707// I - Constant that can be used with an ADD instruction
10708// J - Constant that can be used with a SUB instruction
10709// K - Constant that can be used with a 32-bit logical instruction
10710// L - Constant that can be used with a 64-bit logical instruction
10711// M - Constant that can be used as a 32-bit MOV immediate
10712// N - Constant that can be used as a 64-bit MOV immediate
10713// Q - A memory reference with base register and no offset
10714// S - A symbolic address
10715// Y - Floating point constant zero
10716// Z - Integer constant zero
10717//
10718// Note that general register operands will be output using their 64-bit x
10719// register name, whatever the size of the variable, unless the asm operand
10720// is prefixed by the %w modifier. Floating-point and SIMD register operands
10721// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
10722// %q modifier.
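//
// Illustrative user-level example (not from this file):
//   int sum;
//   asm("add %w0, %w1, %w2" : "=r"(sum) : "r"(lhs), "I"(4095));
// uses 'r' for general registers and 'I' for an ADD-style immediate in the
// range [0, 4095]; the %w modifier requests the 32-bit register names.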
10723const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
10724 // At this point, we have to lower this constraint to something else, so we
10725 // lower it to an "r" or "w". However, by doing this we will force the result
10726 // to be in register, while the X constraint is much more permissive.
10727 //
10728 // Although we are correct (we are free to emit anything, without
10729 // constraints), we might break use cases that would expect us to be more
10730 // efficient and emit something else.
10731 if (!Subtarget->hasFPARMv8())
10732 return "r";
10733
10734 if (ConstraintVT.isFloatingPoint())
10735 return "w";
10736
10737 if (ConstraintVT.isVector() &&
10738 (ConstraintVT.getSizeInBits() == 64 ||
10739 ConstraintVT.getSizeInBits() == 128))
10740 return "w";
10741
10742 return "r";
10743}
10744
10745 enum class PredicateConstraint { Uph, Upl, Upa };
10746
10747static std::optional<PredicateConstraint>
10750 .Case("Uph", PredicateConstraint::Uph)
10751 .Case("Upl", PredicateConstraint::Upl)
10752 .Case("Upa", PredicateConstraint::Upa)
10753 .Default(std::nullopt);
10754}
10755
10756static const TargetRegisterClass *
10758 if (VT != MVT::aarch64svcount &&
10759 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
10760 return nullptr;
10761
10762 switch (Constraint) {
10763 case PredicateConstraint::Uph:
10764 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
10765 : &AArch64::PPR_p8to15RegClass;
10766 case PredicateConstraint::Upl:
10767 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
10768 : &AArch64::PPR_3bRegClass;
10769 case PredicateConstraint::Upa:
10770 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
10771 : &AArch64::PPRRegClass;
10772 }
10773
10774 llvm_unreachable("Missing PredicateConstraint!");
10775}
10776
10777 enum class ReducedGprConstraint { Uci, Ucj };
10778
10779static std::optional<ReducedGprConstraint>
10782 .Case("Uci", ReducedGprConstraint::Uci)
10783 .Case("Ucj", ReducedGprConstraint::Ucj)
10784 .Default(std::nullopt);
10785}
10786
10787static const TargetRegisterClass *
10789 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
10790 return nullptr;
10791
10792 switch (Constraint) {
10793 case ReducedGprConstraint::Uci:
10794 return &AArch64::MatrixIndexGPR32_8_11RegClass;
10795 case ReducedGprConstraint::Ucj:
10796 return &AArch64::MatrixIndexGPR32_12_15RegClass;
10797 }
10798
10799 llvm_unreachable("Missing ReducedGprConstraint!");
10800}
10801
10802 // The set of cc codes supported is from
10803 // https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
10804 static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint) {
10805 AArch64CC::CondCode Cond = StringSwitch<AArch64CC::CondCode>(Constraint)
10806 .Case("{@cchi}", AArch64CC::HI)
10807 .Case("{@cccs}", AArch64CC::HS)
10808 .Case("{@cclo}", AArch64CC::LO)
10809 .Case("{@ccls}", AArch64CC::LS)
10810 .Case("{@cccc}", AArch64CC::LO)
10811 .Case("{@cceq}", AArch64CC::EQ)
10812 .Case("{@ccgt}", AArch64CC::GT)
10813 .Case("{@ccge}", AArch64CC::GE)
10814 .Case("{@cclt}", AArch64CC::LT)
10815 .Case("{@ccle}", AArch64CC::LE)
10816 .Case("{@cchs}", AArch64CC::HS)
10817 .Case("{@ccne}", AArch64CC::NE)
10818 .Case("{@ccvc}", AArch64CC::VC)
10819 .Case("{@ccpl}", AArch64CC::PL)
10820 .Case("{@ccvs}", AArch64CC::VS)
10821 .Case("{@ccmi}", AArch64CC::MI)
10822 .Default(AArch64CC::Invalid);
10823 return Cond;
10824}
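// Illustrative user-level example (not from this file): a GCC-style flag
// output such as
//   asm("cmp %1, %2" : "=@cceq"(is_eq) : "r"(a), "r"(b));
// arrives here as the constraint string "{@cceq}", which maps to
// AArch64CC::EQ and is materialized by LowerAsmOutputForConstraint below.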
10825
10826/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
10827/// WZR, invert(<cond>)'.
10828 static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL,
10829 SelectionDAG &DAG) {
10830 return DAG.getNode(
10831 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
10832 DAG.getConstant(0, DL, MVT::i32),
10833 DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32), NZCV);
10834}
10835
10836// Lower @cc flag output via getSETCC.
10837SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
10838 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
10839 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
10840 AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
10841 if (Cond == AArch64CC::Invalid)
10842 return SDValue();
10843 // The output variable should be a scalar integer.
10844 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
10845 OpInfo.ConstraintVT.getSizeInBits() < 8)
10846 report_fatal_error("Flag output operand is of invalid type");
10847
10848 // Get NZCV register. Only update chain when copyfrom is glued.
10849 if (Glue.getNode()) {
10850 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32, Glue);
10851 Chain = Glue.getValue(1);
10852 } else
10853 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32);
10854 // Extract CC code.
10855 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
10856
10857 SDValue Result;
10858
10859 // Truncate or ZERO_EXTEND based on value types.
10860 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
10861 Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
10862 else
10863 Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
10864
10865 return Result;
10866}
10867
10868/// getConstraintType - Given a constraint letter, return the type of
10869/// constraint it is for this target.
10871AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
10872 if (Constraint.size() == 1) {
10873 switch (Constraint[0]) {
10874 default:
10875 break;
10876 case 'x':
10877 case 'w':
10878 case 'y':
10879 return C_RegisterClass;
10880 // An address with a single base register. Due to the way we
10881 // currently handle addresses it is the same as 'r'.
10882 case 'Q':
10883 return C_Memory;
10884 case 'I':
10885 case 'J':
10886 case 'K':
10887 case 'L':
10888 case 'M':
10889 case 'N':
10890 case 'Y':
10891 case 'Z':
10892 return C_Immediate;
10893 case 'z':
10894 case 'S': // A symbol or label reference with a constant offset
10895 return C_Other;
10896 }
10897 } else if (parsePredicateConstraint(Constraint))
10898 return C_RegisterClass;
10899 else if (parseReducedGprConstraint(Constraint))
10900 return C_RegisterClass;
10901 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
10902 return C_Other;
10903 return TargetLowering::getConstraintType(Constraint);
10904}
10905
10906/// Examine constraint type and operand type and determine a weight value.
10907/// This object must already have been set up with the operand type
10908/// and the current alternative constraint selected.
10909 TargetLowering::ConstraintWeight
10910 AArch64TargetLowering::getSingleConstraintMatchWeight(
10911 AsmOperandInfo &info, const char *constraint) const {
10912 ConstraintWeight weight = CW_Invalid;
10913 Value *CallOperandVal = info.CallOperandVal;
10914 // If we don't have a value, we can't do a match,
10915 // but allow it at the lowest weight.
10916 if (!CallOperandVal)
10917 return CW_Default;
10918 Type *type = CallOperandVal->getType();
10919 // Look at the constraint type.
10920 switch (*constraint) {
10921 default:
10922 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
10923 break;
10924 case 'x':
10925 case 'w':
10926 case 'y':
10927 if (type->isFloatingPointTy() || type->isVectorTy())
10928 weight = CW_Register;
10929 break;
10930 case 'z':
10931 weight = CW_Constant;
10932 break;
10933 case 'U':
10934 if (parsePredicateConstraint(constraint) ||
10935 parseReducedGprConstraint(constraint))
10936 weight = CW_Register;
10937 break;
10938 }
10939 return weight;
10940}
10941
10942std::pair<unsigned, const TargetRegisterClass *>
10943AArch64TargetLowering::getRegForInlineAsmConstraint(
10944 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
10945 if (Constraint.size() == 1) {
10946 switch (Constraint[0]) {
10947 case 'r':
10948 if (VT.isScalableVector())
10949 return std::make_pair(0U, nullptr);
10950 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
10951 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
10952 if (VT.getFixedSizeInBits() == 64)
10953 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
10954 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
10955 case 'w': {
10956 if (!Subtarget->hasFPARMv8())
10957 break;
10958 if (VT.isScalableVector()) {
10959 if (VT.getVectorElementType() != MVT::i1)
10960 return std::make_pair(0U, &AArch64::ZPRRegClass);
10961 return std::make_pair(0U, nullptr);
10962 }
10963 uint64_t VTSize = VT.getFixedSizeInBits();
10964 if (VTSize == 16)
10965 return std::make_pair(0U, &AArch64::FPR16RegClass);
10966 if (VTSize == 32)
10967 return std::make_pair(0U, &AArch64::FPR32RegClass);
10968 if (VTSize == 64)
10969 return std::make_pair(0U, &AArch64::FPR64RegClass);
10970 if (VTSize == 128)
10971 return std::make_pair(0U, &AArch64::FPR128RegClass);
10972 break;
10973 }
10974 // The instructions that this constraint is designed for can
10975 // only take 128-bit registers so just use that regclass.
10976 case 'x':
10977 if (!Subtarget->hasFPARMv8())
10978 break;
10979 if (VT.isScalableVector())
10980 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
10981 if (VT.getSizeInBits() == 128)
10982 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
10983 break;
10984 case 'y':
10985 if (!Subtarget->hasFPARMv8())
10986 break;
10987 if (VT.isScalableVector())
10988 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
10989 break;
10990 }
10991 } else {
10992 if (const auto PC = parsePredicateConstraint(Constraint))
10993 if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
10994 return std::make_pair(0U, RegClass);
10995
10996 if (const auto RGC = parseReducedGprConstraint(Constraint))
10997 if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
10998 return std::make_pair(0U, RegClass);
10999 }
11000 if (StringRef("{cc}").equals_insensitive(Constraint) ||
11001 parseConstraintCode(Constraint) != AArch64CC::Invalid)
11002 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
11003
11004 if (Constraint == "{za}") {
11005 return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
11006 }
11007
11008 if (Constraint == "{zt0}") {
11009 return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
11010 }
11011
11012 // Use the default implementation in TargetLowering to convert the register
11013 // constraint into a member of a register class.
11014 std::pair<unsigned, const TargetRegisterClass *> Res;
11015 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
11016
11017 // Not found as a standard register?
11018 if (!Res.second) {
11019 unsigned Size = Constraint.size();
11020 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
11021 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
11022 int RegNo;
11023 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
11024 if (!Failed && RegNo >= 0 && RegNo <= 31) {
11025 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
11026 // By default we'll emit v0-v31 for this unless there's a modifier where
11027 // we'll emit the correct register as well.
11028 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
11029 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
11030 Res.second = &AArch64::FPR64RegClass;
11031 } else {
11032 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
11033 Res.second = &AArch64::FPR128RegClass;
11034 }
11035 }
11036 }
11037 }
11038
11039 if (Res.second && !Subtarget->hasFPARMv8() &&
11040 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
11041 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
11042 return std::make_pair(0U, nullptr);
11043
11044 return Res;
11045}
11046
11047 EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
11048 llvm::Type *Ty,
11049 bool AllowUnknown) const {
11050 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
11051 return EVT(MVT::i64x8);
11052
11053 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
11054}
11055
11056/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
11057/// vector. If it is invalid, don't add anything to Ops.
11058void AArch64TargetLowering::LowerAsmOperandForConstraint(
11059 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
11060 SelectionDAG &DAG) const {
11061 SDValue Result;
11062
11063 // Currently only support length 1 constraints.
11064 if (Constraint.size() != 1)
11065 return;
11066
11067 char ConstraintLetter = Constraint[0];
11068 switch (ConstraintLetter) {
11069 default:
11070 break;
11071
11072 // This set of constraints deals with valid constants for various instructions.
11073 // Validate and return a target constant for them if we can.
11074 case 'z': {
11075 // 'z' maps to xzr or wzr so it needs an input of 0.
11076 if (!isNullConstant(Op))
11077 return;
11078
11079 if (Op.getValueType() == MVT::i64)
11080 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
11081 else
11082 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
11083 break;
11084 }
11085 case 'S':
11086 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
11087 // supported for PIC while "s" isn't, making "s" less useful. We implement
11088 // "S" but not "s".
11089 Result = Op;
11090 break;
11091
11092 case 'I':
11093 case 'J':
11094 case 'K':
11095 case 'L':
11096 case 'M':
11097 case 'N':
11098 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
11099 if (!C)
11100 return;
11101
11102 // Grab the value and do some validation.
11103 uint64_t CVal = C->getZExtValue();
11104 switch (ConstraintLetter) {
11105 // The I constraint applies only to simple ADD or SUB immediate operands:
11106 // i.e. 0 to 4095 with optional shift by 12
11107 // The J constraint applies only to ADD or SUB immediates that would be
11108 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
11109 // instruction [or vice versa], in other words -1 to -4095 with optional
11110 // left shift by 12.
11111 case 'I':
11112 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
11113 break;
11114 return;
11115 case 'J': {
11116 uint64_t NVal = -C->getSExtValue();
11117 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
11118 CVal = C->getSExtValue();
11119 break;
11120 }
11121 return;
11122 }
11123 // The K and L constraints apply *only* to logical immediates, including
11124 // what used to be the MOVI alias for ORR (though the MOVI alias has now
11125 // been removed and MOV should be used). So these constraints have to
11126 // distinguish between bit patterns that are valid 32-bit or 64-bit
11127 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
11128 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
11129 // versa.
11130 case 'K':
11131 if (AArch64_AM::isLogicalImmediate(CVal, 32))
11132 break;
11133 return;
11134 case 'L':
11135 if (AArch64_AM::isLogicalImmediate(CVal, 64))
11136 break;
11137 return;
11138 // The M and N constraints are a superset of K and L respectively, for use
11139 // with the MOV (immediate) alias. As well as the logical immediates they
11140 // also match 32 or 64-bit immediates that can be loaded either using a
11141 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
11142 // (M) or 64-bit 0x1234000000000000 (N) etc.
11143 // As a note, some of this code is liberally stolen from the asm parser.
11144 case 'M': {
11145 if (!isUInt<32>(CVal))
11146 return;
11147 if (AArch64_AM::isLogicalImmediate(CVal, 32))
11148 break;
11149 if ((CVal & 0xFFFF) == CVal)
11150 break;
11151 if ((CVal & 0xFFFF0000ULL) == CVal)
11152 break;
11153 uint64_t NCVal = ~(uint32_t)CVal;
11154 if ((NCVal & 0xFFFFULL) == NCVal)
11155 break;
11156 if ((NCVal & 0xFFFF0000ULL) == NCVal)
11157 break;
11158 return;
11159 }
11160 case 'N': {
11161 if (AArch64_AM::isLogicalImmediate(CVal, 64))
11162 break;
11163 if ((CVal & 0xFFFFULL) == CVal)
11164 break;
11165 if ((CVal & 0xFFFF0000ULL) == CVal)
11166 break;
11167 if ((CVal & 0xFFFF00000000ULL) == CVal)
11168 break;
11169 if ((CVal & 0xFFFF000000000000ULL) == CVal)
11170 break;
11171 uint64_t NCVal = ~CVal;
11172 if ((NCVal & 0xFFFFULL) == NCVal)
11173 break;
11174 if ((NCVal & 0xFFFF0000ULL) == NCVal)
11175 break;
11176 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
11177 break;
11178 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
11179 break;
11180 return;
11181 }
11182 default:
11183 return;
11184 }
11185
11186 // All assembler immediates are 64-bit integers.
11187 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
11188 break;
11189 }
11190
11191 if (Result.getNode()) {
11192 Ops.push_back(Result);
11193 return;
11194 }
11195
11196 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
11197}
11198
11199//===----------------------------------------------------------------------===//
11200// AArch64 Advanced SIMD Support
11201//===----------------------------------------------------------------------===//
11202
11203/// WidenVector - Given a value in the V64 register class, produce the
11204/// equivalent value in the V128 register class.
11205 static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
11206 EVT VT = V64Reg.getValueType();
11207 unsigned NarrowSize = VT.getVectorNumElements();
11208 MVT EltTy = VT.getVectorElementType().getSimpleVT();
11209 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
11210 SDLoc DL(V64Reg);
11211
11212 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
11213 V64Reg, DAG.getConstant(0, DL, MVT::i64));
11214}
11215
11216/// getExtFactor - Determine the adjustment factor for the position when
11217/// generating an "extract from vector registers" instruction.
11218static unsigned getExtFactor(SDValue &V) {
11219 EVT EltType = V.getValueType().getVectorElementType();
11220 return EltType.getSizeInBits() / 8;
11221}
11222
11223// Check if a vector is built from one vector via extracted elements of
11224// another together with an AND mask, ensuring that all elements fit
11225// within range. This can be reconstructed using AND and NEON's TBL1.
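// Illustrative shape of the match (informal notation):
//   BUILD_VECTOR(extractelt(V, and(extractelt(M, 0), C0)),
//                extractelt(V, and(extractelt(M, 1), C1)), ...)
// is rebuilt as TBL1(V, and(M, build_vector(C0, C1, ...))), applying the
// optional AND once to the whole mask vector instead of per element.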
11226 static SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) {
11227 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
11228 SDLoc dl(Op);
11229 EVT VT = Op.getValueType();
11230 assert(!VT.isScalableVector() &&
11231 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
11232
11233 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
11234 // directly to TBL1.
11235 if (VT != MVT::v16i8 && VT != MVT::v8i8)
11236 return SDValue();
11237
11238 unsigned NumElts = VT.getVectorNumElements();
11239 assert((NumElts == 8 || NumElts == 16) &&
11240 "Need to have exactly 8 or 16 elements in vector.");
11241
11242 SDValue SourceVec;
11243 SDValue MaskSourceVec;
11244 SmallVector<SDValue, 16> AndMaskConstants;
11245
11246 for (unsigned i = 0; i < NumElts; ++i) {
11247 SDValue V = Op.getOperand(i);
11248 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11249 return SDValue();
11250
11251 SDValue OperandSourceVec = V.getOperand(0);
11252 if (!SourceVec)
11253 SourceVec = OperandSourceVec;
11254 else if (SourceVec != OperandSourceVec)
11255 return SDValue();
11256
11257 // This only looks at shuffles with elements that are
11258 // a) truncated by a constant AND mask extracted from a mask vector, or
11259 // b) extracted directly from a mask vector.
11260 SDValue MaskSource = V.getOperand(1);
11261 if (MaskSource.getOpcode() == ISD::AND) {
11262 if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
11263 return SDValue();
11264
11265 AndMaskConstants.push_back(MaskSource.getOperand(1));
11266 MaskSource = MaskSource->getOperand(0);
11267 } else if (!AndMaskConstants.empty()) {
11268 // Either all or no operands should have an AND mask.
11269 return SDValue();
11270 }
11271
11272 // An ANY_EXTEND may be inserted between the AND and the source vector
11273 // extraction. We don't care about that, so we can just skip it.
11274 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
11275 MaskSource = MaskSource.getOperand(0);
11276
11277 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11278 return SDValue();
11279
11280 SDValue MaskIdx = MaskSource.getOperand(1);
11281 if (!isa<ConstantSDNode>(MaskIdx) ||
11282 !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
11283 return SDValue();
11284
11285 // We only apply this if all elements come from the same vector with the
11286 // same vector type.
11287 if (!MaskSourceVec) {
11288 MaskSourceVec = MaskSource->getOperand(0);
11289 if (MaskSourceVec.getValueType() != VT)
11290 return SDValue();
11291 } else if (MaskSourceVec != MaskSource->getOperand(0)) {
11292 return SDValue();
11293 }
11294 }
11295
11296 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
11297 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
11298 // insert, we know that the index in the mask must be smaller than the number
11299 // of elements in the source, or we would have an out-of-bounds access.
11300 if (NumElts == 8)
11301 SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, SourceVec,
11302 DAG.getUNDEF(VT));
11303
11304 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
11305 if (!AndMaskConstants.empty())
11306 MaskSourceVec = DAG.getNode(ISD::AND, dl, VT, MaskSourceVec,
11307 DAG.getBuildVector(VT, dl, AndMaskConstants));
11308
11309 return DAG.getNode(
11310 ISD::INTRINSIC_WO_CHAIN, dl, VT,
11311 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, dl, MVT::i32), SourceVec,
11312 MaskSourceVec);
11313}
11314
11315// Gather data to see if the operation can be modelled as a
11316// shuffle in combination with VEXTs.
11317 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
11318 SelectionDAG &DAG) const {
11319 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
11320 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
11321 SDLoc dl(Op);
11322 EVT VT = Op.getValueType();
11323 assert(!VT.isScalableVector() &&
11324 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
11325 unsigned NumElts = VT.getVectorNumElements();
11326
11327 struct ShuffleSourceInfo {
11328 SDValue Vec;
11329 unsigned MinElt;
11330 unsigned MaxElt;
11331
11332 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
11333 // be compatible with the shuffle we intend to construct. As a result
11334 // ShuffleVec will be some sliding window into the original Vec.
11335 SDValue ShuffleVec;
11336
11337 // Code should guarantee that element i in Vec starts at element "WindowBase
11338 // + i * WindowScale in ShuffleVec".
11339 int WindowBase;
11340 int WindowScale;
11341
11342 ShuffleSourceInfo(SDValue Vec)
11343 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
11344 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
11345
11346 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
11347 };
11348
11349 // First gather all vectors used as an immediate source for this BUILD_VECTOR
11350 // node.
11351 SmallVector<ShuffleSourceInfo, 2> Sources;
11352 for (unsigned i = 0; i < NumElts; ++i) {
11353 SDValue V = Op.getOperand(i);
11354 if (V.isUndef())
11355 continue;
11356 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11357 !isa<ConstantSDNode>(V.getOperand(1)) ||
11358 V.getOperand(0).getValueType().isScalableVector()) {
11359 LLVM_DEBUG(
11360 dbgs() << "Reshuffle failed: "
11361 "a shuffle can only come from building a vector from "
11362 "various elements of other fixed-width vectors, provided "
11363 "their indices are constant\n");
11364 return SDValue();
11365 }
11366
11367 // Add this element source to the list if it's not already there.
11368 SDValue SourceVec = V.getOperand(0);
11369 auto Source = find(Sources, SourceVec);
11370 if (Source == Sources.end())
11371 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
11372
11373 // Update the minimum and maximum lane number seen.
11374 unsigned EltNo = V.getConstantOperandVal(1);
11375 Source->MinElt = std::min(Source->MinElt, EltNo);
11376 Source->MaxElt = std::max(Source->MaxElt, EltNo);
11377 }
11378
11379 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
11380 // better than moving to/from gpr registers for larger vectors.
11381 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
11382 // Construct a mask for the tbl. We may need to adjust the index for types
11383 // larger than i8.
11384 SmallVector<unsigned, 16> Mask;
11385 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
11386 for (unsigned I = 0; I < NumElts; ++I) {
11387 SDValue V = Op.getOperand(I);
11388 if (V.isUndef()) {
11389 for (unsigned OF = 0; OF < OutputFactor; OF++)
11390 Mask.push_back(-1);
11391 continue;
11392 }
11393 // Set the Mask lanes adjusted for the size of the input and output
11394 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
11395 // output element, adjusted in their positions per input and output types.
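// E.g. (illustrative): for a v8i16 output built from v8i16 sources,
// OutputFactor == 2, so an element taken from lane 3 of source S == 1 gets
// InputBase == 16 * 1 + 3 * 16 / 8 == 22 and contributes mask bytes 22, 23.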
11396 unsigned Lane = V.getConstantOperandVal(1);
11397 for (unsigned S = 0; S < Sources.size(); S++) {
11398 if (V.getOperand(0) == Sources[S].Vec) {
11399 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
11400 unsigned InputBase = 16 * S + Lane * InputSize / 8;
11401 for (unsigned OF = 0; OF < OutputFactor; OF++)
11402 Mask.push_back(InputBase + OF);
11403 break;
11404 }
11405 }
11406 }
11407
11408 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
11409 // v16i8, and the TBLMask
11410 SmallVector<SDValue, 16> TBLOperands;
11411 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
11412 ? Intrinsic::aarch64_neon_tbl3
11413 : Intrinsic::aarch64_neon_tbl4,
11414 dl, MVT::i32));
11415 for (unsigned i = 0; i < Sources.size(); i++) {
11416 SDValue Src = Sources[i].Vec;
11417 EVT SrcVT = Src.getValueType();
11418 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
11419 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
11420 "Expected a legally typed vector");
11421 if (SrcVT.is64BitVector())
11422 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src,
11423 DAG.getUNDEF(MVT::v8i8));
11424 TBLOperands.push_back(Src);
11425 }
11426
11427 SmallVector<SDValue, 16> TBLMask;
11428 for (unsigned i = 0; i < Mask.size(); i++)
11429 TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32));
11430 assert((Mask.size() == 8 || Mask.size() == 16) &&
11431 "Expected a v8i8 or v16i8 Mask");
11432 TBLOperands.push_back(
11433 DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));
11434
11435 SDValue Shuffle =
11436 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
11437 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
11438 return DAG.getBitcast(VT, Shuffle);
11439 }
11440
11441 if (Sources.size() > 2) {
11442 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
11443 << "sensible when at most two source vectors are "
11444 << "involved\n");
11445 return SDValue();
11446 }
11447
11448 // Find out the smallest element size among result and two sources, and use
11449 // it as element size to build the shuffle_vector.
11450 EVT SmallestEltTy = VT.getVectorElementType();
11451 for (auto &Source : Sources) {
11452 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
11453 if (SrcEltTy.bitsLT(SmallestEltTy)) {
11454 SmallestEltTy = SrcEltTy;
11455 }
11456 }
11457 unsigned ResMultiplier =
11458 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
11459 uint64_t VTSize = VT.getFixedSizeInBits();
11460 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
11461 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
11462
11463 // If the source vector is too wide or too narrow, we may nevertheless be able
11464 // to construct a compatible shuffle either by concatenating it with UNDEF or
11465 // extracting a suitable range of elements.
11466 for (auto &Src : Sources) {
11467 EVT SrcVT = Src.ShuffleVec.getValueType();
11468
11469 TypeSize SrcVTSize = SrcVT.getSizeInBits();
11470 if (SrcVTSize == TypeSize::getFixed(VTSize))
11471 continue;
11472
11473 // This stage of the search produces a source with the same element type as
11474 // the original, but with a total width matching the BUILD_VECTOR output.
11475 EVT EltVT = SrcVT.getVectorElementType();
11476 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
11477 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
11478
11479 if (SrcVTSize.getFixedValue() < VTSize) {
11480 assert(2 * SrcVTSize == VTSize);
11481 // We can pad out the smaller vector for free by concatenating it with
11482 // UNDEF, so do that and carry on with the shuffle.
11483 Src.ShuffleVec =
11484 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
11485 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
11486 continue;
11487 }
11488
11489 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
11490 LLVM_DEBUG(
11491 dbgs() << "Reshuffle failed: result vector too small to extract\n");
11492 return SDValue();
11493 }
11494
11495 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
11496 LLVM_DEBUG(
11497 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
11498 return SDValue();
11499 }
11500
11501 if (Src.MinElt >= NumSrcElts) {
11502 // The extraction can just take the second half
11503 Src.ShuffleVec =
11504 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11505 DAG.getConstant(NumSrcElts, dl, MVT::i64));
11506 Src.WindowBase = -NumSrcElts;
11507 } else if (Src.MaxElt < NumSrcElts) {
11508 // The extraction can just take the first half
11509 Src.ShuffleVec =
11510 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11511 DAG.getConstant(0, dl, MVT::i64));
11512 } else {
11513 // An actual VEXT is needed
11514 SDValue VEXTSrc1 =
11515 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11516 DAG.getConstant(0, dl, MVT::i64));
11517 SDValue VEXTSrc2 =
11518 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11519 DAG.getConstant(NumSrcElts, dl, MVT::i64));
11520 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
11521
11522 if (!SrcVT.is64BitVector()) {
11523 LLVM_DEBUG(
11524 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
11525 "for SVE vectors.");
11526 return SDValue();
11527 }
11528
11529 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
11530 VEXTSrc2,
11531 DAG.getConstant(Imm, dl, MVT::i32));
11532 Src.WindowBase = -Src.MinElt;
11533 }
11534 }
11535
11536 // Another possible incompatibility occurs from the vector element types. We
11537 // can fix this by bitcasting the source vectors to the same type we intend
11538 // for the shuffle.
11539 for (auto &Src : Sources) {
11540 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
11541 if (SrcEltTy == SmallestEltTy)
11542 continue;
11543 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
11544 if (DAG.getDataLayout().isBigEndian()) {
11545 Src.ShuffleVec =
11546 DAG.getNode(AArch64ISD::NVCAST, dl, ShuffleVT, Src.ShuffleVec);
11547 } else {
11548 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
11549 }
11550 Src.WindowScale =
11551 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
11552 Src.WindowBase *= Src.WindowScale;
11553 }
11554
11555 // Final check before we try to actually produce a shuffle.
11556 LLVM_DEBUG(for (auto Src
11557 : Sources)
11558 assert(Src.ShuffleVec.getValueType() == ShuffleVT););
11559
11560 // The stars all align, our next step is to produce the mask for the shuffle.
11561 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
11562 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
11563 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
11564 SDValue Entry = Op.getOperand(i);
11565 if (Entry.isUndef())
11566 continue;
11567
11568 auto Src = find(Sources, Entry.getOperand(0));
11569 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
11570
11571 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
11572 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
11573 // segment.
11574 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
11575 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
11576 VT.getScalarSizeInBits());
11577 int LanesDefined = BitsDefined / BitsPerShuffleLane;
11578
11579 // This source is expected to fill ResMultiplier lanes of the final shuffle,
11580 // starting at the appropriate offset.
11581 int *LaneMask = &Mask[i * ResMultiplier];
11582
11583 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
11584 ExtractBase += NumElts * (Src - Sources.begin());
11585 for (int j = 0; j < LanesDefined; ++j)
11586 LaneMask[j] = ExtractBase + j;
11587 }
11588
11589 // Final check before we try to produce nonsense...
11590 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
11591 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
11592 return SDValue();
11593 }
11594
11595 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
11596 for (unsigned i = 0; i < Sources.size(); ++i)
11597 ShuffleOps[i] = Sources[i].ShuffleVec;
11598
11599 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
11600 ShuffleOps[1], Mask);
11601 SDValue V;
11602 if (DAG.getDataLayout().isBigEndian()) {
11603 V = DAG.getNode(AArch64ISD::NVCAST, dl, VT, Shuffle);
11604 } else {
11605 V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
11606 }
11607
11608 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
11609 dbgs() << "Reshuffle, creating node: "; V.dump(););
11610
11611 return V;
11612}
11613
11614// check if an EXT instruction can handle the shuffle mask when the
11615// vector sources of the shuffle are the same.
11616static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
11617 unsigned NumElts = VT.getVectorNumElements();
11618
11619 // Assume that the first shuffle index is not UNDEF. Fail if it is.
11620 if (M[0] < 0)
11621 return false;
11622
11623 Imm = M[0];
11624
11625 // If this is a VEXT shuffle, the immediate value is the index of the first
11626 // element. The other shuffle indices must be the successive elements after
11627 // the first one.
11628 unsigned ExpectedElt = Imm;
11629 for (unsigned i = 1; i < NumElts; ++i) {
11630 // Increment the expected index. If it wraps around, just follow it
11631 // back to index zero and keep going.
11632 ++ExpectedElt;
11633 if (ExpectedElt == NumElts)
11634 ExpectedElt = 0;
11635
11636 if (M[i] < 0)
11637 continue; // ignore UNDEF indices
11638 if (ExpectedElt != static_cast<unsigned>(M[i]))
11639 return false;
11640 }
11641
11642 return true;
11643}
11644
11645// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
11646// v4i32s. This is really a truncate, which we can construct out of (legal)
11647// concats and truncate nodes.
11648 static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
11649 if (V.getValueType() != MVT::v16i8)
11650 return SDValue();
11651 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
11652
11653 for (unsigned X = 0; X < 4; X++) {
11654 // Check the first item in each group is an extract from lane 0 of a v4i32
11655 // or v4i16.
11656 SDValue BaseExt = V.getOperand(X * 4);
11657 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11658 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
11659 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
11660 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
11661 BaseExt.getConstantOperandVal(1) != 0)
11662 return SDValue();
11663 SDValue Base = BaseExt.getOperand(0);
11664 // And check the other items are extracts from the same vector.
11665 for (unsigned Y = 1; Y < 4; Y++) {
11666 SDValue Ext = V.getOperand(X * 4 + Y);
11667 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11668 Ext.getOperand(0) != Base ||
11669 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
11670 Ext.getConstantOperandVal(1) != Y)
11671 return SDValue();
11672 }
11673 }
11674
11675 // Turn the buildvector into a series of truncates and concats, which will
11676 // become uzip1's. Any v4i32s we found get truncated to v4i16, which are
11677 // concatenated together to produce 2 v8i16. These are both truncated and
11678 // concatenated together.
11679 SDLoc DL(V);
11680 SDValue Trunc[4] = {
11681 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
11682 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
11683 for (SDValue &V : Trunc)
11684 if (V.getValueType() == MVT::v4i32)
11685 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
11686 SDValue Concat0 =
11687 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
11688 SDValue Concat1 =
11689 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
11690 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
11691 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
11692 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
11693}
11694
11695 /// Check if a vector shuffle corresponds to a DUP instruction with a larger
11696 /// element width than the vector lane type. If that is the case, the function
11697 /// returns true and writes the value of the DUP instruction lane operand into
11698 /// DupLaneOp.
11699static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
11700 unsigned &DupLaneOp) {
11701 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
11702 "Only possible block sizes for wide DUP are: 16, 32, 64");
11703
11704 if (BlockSize <= VT.getScalarSizeInBits())
11705 return false;
11706 if (BlockSize % VT.getScalarSizeInBits() != 0)
11707 return false;
11708 if (VT.getSizeInBits() % BlockSize != 0)
11709 return false;
11710
11711 size_t SingleVecNumElements = VT.getVectorNumElements();
11712 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
11713 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
11714
11715 // We are looking for masks like
11716 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
11717 // might be replaced by 'undefined'. BlockElts will eventually contain
11718 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
11719 // for the above examples)
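// E.g. (illustrative): for v4i16 and BlockSize == 32, the mask [2, 3, 2, 3]
// gives NumEltsPerBlock == 2, BlockElts == [2, 3] and DupLaneOp == 1.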
11720 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
11721 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
11722 for (size_t I = 0; I < NumEltsPerBlock; I++) {
11723 int Elt = M[BlockIndex * NumEltsPerBlock + I];
11724 if (Elt < 0)
11725 continue;
11726 // For now we don't support shuffles that use the second operand
11727 if ((unsigned)Elt >= SingleVecNumElements)
11728 return false;
11729 if (BlockElts[I] < 0)
11730 BlockElts[I] = Elt;
11731 else if (BlockElts[I] != Elt)
11732 return false;
11733 }
11734
11735 // We found a candidate block (possibly with some undefs). It must be a
11736 // sequence of consecutive integers starting with a value divisible by
11737 // NumEltsPerBlock with some values possibly replaced by undef-s.
11738
11739 // Find first non-undef element
11740 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
11741 assert(FirstRealEltIter != BlockElts.end() &&
11742 "Shuffle with all-undefs must have been caught by previous cases, "
11743 "e.g. isSplat()");
11744 if (FirstRealEltIter == BlockElts.end()) {
11745 DupLaneOp = 0;
11746 return true;
11747 }
11748
11749 // Index of FirstRealElt in BlockElts
11750 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
11751
11752 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
11753 return false;
11754 // BlockElts[0] must have the following value if it isn't undef:
11755 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
11756
11757 // Check the first element
11758 if (Elt0 % NumEltsPerBlock != 0)
11759 return false;
11760 // Check that the sequence indeed consists of consecutive integers (modulo
11761 // undefs)
11762 for (size_t I = 0; I < NumEltsPerBlock; I++)
11763 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
11764 return false;
11765
11766 DupLaneOp = Elt0 / NumEltsPerBlock;
11767 return true;
11768}
11769
11770// check if an EXT instruction can handle the shuffle mask when the
11771// vector sources of the shuffle are different.
11772static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
11773 unsigned &Imm) {
11774 // Look for the first non-undef element.
11775 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
11776
11777 // Benefit from APInt to handle overflow when calculating the expected element.
11778 unsigned NumElts = VT.getVectorNumElements();
11779 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
11780 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
11781 // The following shuffle indices must be the successive elements after the
11782 // first real element.
11783 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
11784 return Elt != ExpectedElt++ && Elt != -1;
11785 });
11786 if (FoundWrongElt)
11787 return false;
11788
11789 // The index of an EXT is the first element if it is not UNDEF.
11790 // Watch out for the beginning UNDEFs. The EXT index should be the expected
11791 // value of the first element. E.g.
11792 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
11793 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
11794 // ExpectedElt is the last mask index plus 1.
11795 Imm = ExpectedElt.getZExtValue();
11796
11797 // There are two different cases that require reversing the input vectors.
11798 // For example, for vector <4 x i32> we have the following cases,
11799 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
11800 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
11801 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
11802 // to reverse two input vectors.
11803 if (Imm < NumElts)
11804 ReverseEXT = true;
11805 else
11806 Imm -= NumElts;
11807
11808 return true;
11809}
11810
11811/// isREVMask - Check if a vector shuffle corresponds to a REV
11812/// instruction with the specified blocksize. (The order of the elements
11813/// within each block of the vector is reversed.)
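/// E.g. (illustrative): for v8i8 and BlockSize == 32 the expected mask is
/// <3, 2, 1, 0, 7, 6, 5, 4>, i.e. what a REV32 of the 8-bit lanes produces.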
11814static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
11815 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64 ||
11816 BlockSize == 128) &&
11817 "Only possible block sizes for REV are: 16, 32, 64, 128");
11818
11819 unsigned EltSz = VT.getScalarSizeInBits();
11820 unsigned NumElts = VT.getVectorNumElements();
11821 unsigned BlockElts = M[0] + 1;
11822 // If the first shuffle index is UNDEF, be optimistic.
11823 if (M[0] < 0)
11824 BlockElts = BlockSize / EltSz;
11825
11826 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
11827 return false;
11828
11829 for (unsigned i = 0; i < NumElts; ++i) {
11830 if (M[i] < 0)
11831 continue; // ignore UNDEF indices
11832 if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
11833 return false;
11834 }
11835
11836 return true;
11837}
11838
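/// isTRNMask - Check if a vector shuffle corresponds to TRN1/TRN2; e.g.
/// (illustrative) for v4i32 the masks are <0, 4, 2, 6> (WhichResult == 0)
/// and <1, 5, 3, 7> (WhichResult == 1).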
11839static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11840 unsigned NumElts = VT.getVectorNumElements();
11841 if (NumElts % 2 != 0)
11842 return false;
11843 WhichResult = (M[0] == 0 ? 0 : 1);
11844 for (unsigned i = 0; i < NumElts; i += 2) {
11845 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
11846 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
11847 return false;
11848 }
11849 return true;
11850}
11851
11852/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
11853/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11854/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
11855static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11856 unsigned NumElts = VT.getVectorNumElements();
11857 if (NumElts % 2 != 0)
11858 return false;
11859 WhichResult = (M[0] == 0 ? 0 : 1);
11860 unsigned Idx = WhichResult * NumElts / 2;
11861 for (unsigned i = 0; i != NumElts; i += 2) {
11862 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
11863 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
11864 return false;
11865 Idx += 1;
11866 }
11867
11868 return true;
11869}
11870
11871/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
11872/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11873/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
11874static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11875 unsigned Half = VT.getVectorNumElements() / 2;
11876 WhichResult = (M[0] == 0 ? 0 : 1);
11877 for (unsigned j = 0; j != 2; ++j) {
11878 unsigned Idx = WhichResult;
11879 for (unsigned i = 0; i != Half; ++i) {
11880 int MIdx = M[i + j * Half];
11881 if (MIdx >= 0 && (unsigned)MIdx != Idx)
11882 return false;
11883 Idx += 2;
11884 }
11885 }
11886
11887 return true;
11888}
11889
11890/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
11891/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11892/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
11893static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11894 unsigned NumElts = VT.getVectorNumElements();
11895 if (NumElts % 2 != 0)
11896 return false;
11897 WhichResult = (M[0] == 0 ? 0 : 1);
11898 for (unsigned i = 0; i < NumElts; i += 2) {
11899 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
11900 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
11901 return false;
11902 }
11903 return true;
11904}
11905
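/// isINSMask - Check whether the shuffle is an identity copy of one input
/// except for a single lane, which can then be handled with INS; e.g.
/// (illustrative) with 4 elements the mask <0, 1, 6, 3> matches the LHS
/// everywhere except lane 2, so DstIsLeft == true and Anomaly == 2.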
11906static bool isINSMask(ArrayRef<int> M, int NumInputElements,
11907 bool &DstIsLeft, int &Anomaly) {
11908 if (M.size() != static_cast<size_t>(NumInputElements))
11909 return false;
11910
11911 int NumLHSMatch = 0, NumRHSMatch = 0;
11912 int LastLHSMismatch = -1, LastRHSMismatch = -1;
11913
11914 for (int i = 0; i < NumInputElements; ++i) {
11915 if (M[i] == -1) {
11916 ++NumLHSMatch;
11917 ++NumRHSMatch;
11918 continue;
11919 }
11920
11921 if (M[i] == i)
11922 ++NumLHSMatch;
11923 else
11924 LastLHSMismatch = i;
11925
11926 if (M[i] == i + NumInputElements)
11927 ++NumRHSMatch;
11928 else
11929 LastRHSMismatch = i;
11930 }
11931
11932 if (NumLHSMatch == NumInputElements - 1) {
11933 DstIsLeft = true;
11934 Anomaly = LastLHSMismatch;
11935 return true;
11936 } else if (NumRHSMatch == NumInputElements - 1) {
11937 DstIsLeft = false;
11938 Anomaly = LastRHSMismatch;
11939 return true;
11940 }
11941
11942 return false;
11943}
11944
11945static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
11946 if (VT.getSizeInBits() != 128)
11947 return false;
11948
11949 unsigned NumElts = VT.getVectorNumElements();
11950
11951 for (int I = 0, E = NumElts / 2; I != E; I++) {
11952 if (Mask[I] != I)
11953 return false;
11954 }
11955
11956 int Offset = NumElts / 2;
11957 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
11958 if (Mask[I] != I + SplitLHS * Offset)
11959 return false;
11960 }
11961
11962 return true;
11963}
11964
11965 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
11966 SDLoc DL(Op);
11967 EVT VT = Op.getValueType();
11968 SDValue V0 = Op.getOperand(0);
11969 SDValue V1 = Op.getOperand(1);
11970 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
11971
11974 return SDValue();
11975
11976 bool SplitV0 = V0.getValueSizeInBits() == 128;
11977
11978 if (!isConcatMask(Mask, VT, SplitV0))
11979 return SDValue();
11980
11981 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
11982 if (SplitV0) {
11983 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
11984 DAG.getConstant(0, DL, MVT::i64));
11985 }
11986 if (V1.getValueSizeInBits() == 128) {
11987 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
11988 DAG.getConstant(0, DL, MVT::i64));
11989 }
11990 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
11991}
11992
11993/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
11994/// the specified operations to build the shuffle. ID is the perfect-shuffle
11995 /// ID, V1 and V2 are the original shuffle inputs. PFEntry is the perfect-shuffle
11996 /// table entry and LHS/RHS are the immediate inputs for this stage of the
11997 /// shuffle.
11998 static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
11999 SDValue V2, unsigned PFEntry, SDValue LHS,
12000 SDValue RHS, SelectionDAG &DAG,
12001 const SDLoc &dl) {
12002 unsigned OpNum = (PFEntry >> 26) & 0x0F;
12003 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
12004 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
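// Illustrative decoding of PFEntry: bits [29:26] select the operation,
// bits [25:13] and [12:0] are 13-bit IDs for the two operands. Each ID packs
// a 4-lane mask in base 9 (digit 8 meaning undef), so e.g.
// ((0*9+1)*9+2)*9+3 == 102 is the identity mask <0,1,2,3> tested in OP_COPY.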
12005
12006 enum {
12007 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
12008 OP_VREV,
12009 OP_VDUP0,
12010 OP_VDUP1,
12011 OP_VDUP2,
12012 OP_VDUP3,
12013 OP_VEXT1,
12014 OP_VEXT2,
12015 OP_VEXT3,
12016 OP_VUZPL, // VUZP, left result
12017 OP_VUZPR, // VUZP, right result
12018 OP_VZIPL, // VZIP, left result
12019 OP_VZIPR, // VZIP, right result
12020 OP_VTRNL, // VTRN, left result
12021 OP_VTRNR, // VTRN, right result
12022 OP_MOVLANE // Move lane. RHSID is the lane to move into
12023 };
12024
12025 if (OpNum == OP_COPY) {
12026 if (LHSID == (1 * 9 + 2) * 9 + 3)
12027 return LHS;
12028 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
12029 return RHS;
12030 }
12031
12032 if (OpNum == OP_MOVLANE) {
12033 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
12034 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
12035 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
12036 Elt = 3 - Elt;
12037 while (Elt > 0) {
12038 ID /= 9;
12039 Elt--;
12040 }
12041 return (ID % 9 == 8) ? -1 : ID % 9;
12042 };
12043
12044 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
12045 // get the lane to move from the PFID, which is always from the
12046 // original vectors (V1 or V2).
12047 SDValue OpLHS = GeneratePerfectShuffle(
12048 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
12049 EVT VT = OpLHS.getValueType();
12050 assert(RHSID < 8 && "Expected a lane index for RHSID!");
12051 unsigned ExtLane = 0;
12052 SDValue Input;
12053
12054 // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
12055 // convert into a higher type.
12056 if (RHSID & 0x4) {
12057 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
12058 if (MaskElt == -1)
12059 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
12060 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
12061 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
12062 Input = MaskElt < 2 ? V1 : V2;
12063 if (VT.getScalarSizeInBits() == 16) {
12064 Input = DAG.getBitcast(MVT::v2f32, Input);
12065 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
12066 } else {
12067 assert(VT.getScalarSizeInBits() == 32 &&
12068 "Expected 16 or 32 bit shuffle elements");
12069 Input = DAG.getBitcast(MVT::v2f64, Input);
12070 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
12071 }
12072 } else {
12073 int MaskElt = getPFIDLane(ID, RHSID);
12074 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
12075 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
12076 Input = MaskElt < 4 ? V1 : V2;
12077 // Be careful about creating illegal types. Use f16 instead of i16.
12078 if (VT == MVT::v4i16) {
12079 Input = DAG.getBitcast(MVT::v4f16, Input);
12080 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
12081 }
12082 }
12083 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
12084 Input.getValueType().getVectorElementType(),
12085 Input, DAG.getVectorIdxConstant(ExtLane, dl));
12086 SDValue Ins =
12087 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS,
12088 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl));
12089 return DAG.getBitcast(VT, Ins);
12090 }
12091
12092 SDValue OpLHS, OpRHS;
12093 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
12094 RHS, DAG, dl);
12095 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
12096 RHS, DAG, dl);
12097 EVT VT = OpLHS.getValueType();
12098
12099 switch (OpNum) {
12100 default:
12101 llvm_unreachable("Unknown shuffle opcode!");
12102 case OP_VREV:
12103 // VREV divides the vector in half and swaps within the half.
12104 if (VT.getVectorElementType() == MVT::i32 ||
12105 VT.getVectorElementType() == MVT::f32)
12106 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
12107 // vrev <4 x i16> -> REV32
12108 if (VT.getVectorElementType() == MVT::i16 ||
12109 VT.getVectorElementType() == MVT::f16 ||
12110 VT.getVectorElementType() == MVT::bf16)
12111 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
12112 // vrev <4 x i8> -> REV16
12113 assert(VT.getVectorElementType() == MVT::i8);
12114 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
12115 case OP_VDUP0:
12116 case OP_VDUP1:
12117 case OP_VDUP2:
12118 case OP_VDUP3: {
12119 EVT EltTy = VT.getVectorElementType();
12120 unsigned Opcode;
12121 if (EltTy == MVT::i8)
12122 Opcode = AArch64ISD::DUPLANE8;
12123 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
12124 Opcode = AArch64ISD::DUPLANE16;
12125 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
12126 Opcode = AArch64ISD::DUPLANE32;
12127 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
12128 Opcode = AArch64ISD::DUPLANE64;
12129 else
12130 llvm_unreachable("Invalid vector element type?");
12131
12132 if (VT.getSizeInBits() == 64)
12133 OpLHS = WidenVector(OpLHS, DAG);
12134 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
12135 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
12136 }
12137 case OP_VEXT1:
12138 case OP_VEXT2:
12139 case OP_VEXT3: {
12140 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
12141 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
12142 DAG.getConstant(Imm, dl, MVT::i32));
12143 }
12144 case OP_VUZPL:
12145 return DAG.getNode(AArch64ISD::UZP1, dl, VT, OpLHS, OpRHS);
12146 case OP_VUZPR:
12147 return DAG.getNode(AArch64ISD::UZP2, dl, VT, OpLHS, OpRHS);
12148 case OP_VZIPL:
12149 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, OpLHS, OpRHS);
12150 case OP_VZIPR:
12151 return DAG.getNode(AArch64ISD::ZIP2, dl, VT, OpLHS, OpRHS);
12152 case OP_VTRNL:
12153 return DAG.getNode(AArch64ISD::TRN1, dl, VT, OpLHS, OpRHS);
12154 case OP_VTRNR:
12155 return DAG.getNode(AArch64ISD::TRN2, dl, VT, OpLHS, OpRHS);
12156 }
12157}
12158
12159static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
12160 SelectionDAG &DAG) {
12161 // Check to see if we can use the TBL instruction.
12162 SDValue V1 = Op.getOperand(0);
12163 SDValue V2 = Op.getOperand(1);
12164 SDLoc DL(Op);
12165
12166 EVT EltVT = Op.getValueType().getVectorElementType();
12167 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
12168
12169 bool Swap = false;
12170 if (V1.isUndef() || isZerosVector(V1.getNode())) {
12171 std::swap(V1, V2);
12172 Swap = true;
12173 }
12174
12175 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
12176 // out of range values with 0s. We do need to make sure that any out-of-range
12177 // values are really out-of-range for a v16i8 vector.
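// For example, with no swap needed, a v8i16 shuffle element selecting lane 3
// expands to the byte indices 6 and 7 of the TBL mask (BytesPerElt == 2).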
12178 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
12179 MVT IndexVT = MVT::v8i8;
12180 unsigned IndexLen = 8;
12181 if (Op.getValueSizeInBits() == 128) {
12182 IndexVT = MVT::v16i8;
12183 IndexLen = 16;
12184 }
12185
12186 SmallVector<SDValue, 8> TBLMask;
12187 for (int Val : ShuffleMask) {
12188 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
12189 unsigned Offset = Byte + Val * BytesPerElt;
12190 if (Swap)
12191 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
12192 if (IsUndefOrZero && Offset >= IndexLen)
12193 Offset = 255;
12194 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
12195 }
12196 }
12197
12198 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
12199 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
12200
12201 SDValue Shuffle;
12202 if (IsUndefOrZero) {
12203 if (IndexLen == 8)
12204 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
12205 Shuffle = DAG.getNode(
12206 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12207 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
12208 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12209 } else {
12210 if (IndexLen == 8) {
12211 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
12212 Shuffle = DAG.getNode(
12213 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12214 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
12215 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12216 } else {
12217 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
12218 // cannot currently represent the register constraints on the input
12219 // table registers.
12220 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
12221 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
12222 // IndexLen));
12223 Shuffle = DAG.getNode(
12224 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12225 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
12226 V2Cst,
12227 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12228 }
12229 }
12230 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
12231}
12232
12233static unsigned getDUPLANEOp(EVT EltType) {
12234 if (EltType == MVT::i8)
12235 return AArch64ISD::DUPLANE8;
12236 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
12237 return AArch64ISD::DUPLANE16;
12238 if (EltType == MVT::i32 || EltType == MVT::f32)
12239 return AArch64ISD::DUPLANE32;
12240 if (EltType == MVT::i64 || EltType == MVT::f64)
12241 return AArch64ISD::DUPLANE64;
12242
12243 llvm_unreachable("Invalid vector element type?");
12244}
12245
12246static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
12247 unsigned Opcode, SelectionDAG &DAG) {
12248 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
12249 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
12250 // Match: dup (bitcast (extract_subv X, C)), LaneC
12251 if (BitCast.getOpcode() != ISD::BITCAST ||
12252 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
12253 return false;
12254
12255 // The extract index must align in the destination type. That may not
12256 // happen if the bitcast is from narrow to wide type.
12257 SDValue Extract = BitCast.getOperand(0);
12258 unsigned ExtIdx = Extract.getConstantOperandVal(1);
12259 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
12260 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
12261 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
12262 if (ExtIdxInBits % CastedEltBitWidth != 0)
12263 return false;
12264
12265 // Can't handle cases where vector size is not 128-bit
12266 if (!Extract.getOperand(0).getValueType().is128BitVector())
12267 return false;
12268
12269 // Update the lane value by offsetting with the scaled extract index.
12270 LaneC += ExtIdxInBits / CastedEltBitWidth;
12271
12272 // Determine the casted vector type of the wide vector input.
12273 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
12274 // Examples:
12275 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
12276 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
12277 unsigned SrcVecNumElts =
12278 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
12279 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
12280 SrcVecNumElts);
12281 return true;
12282 };
12283 MVT CastVT;
12284 if (getScaledOffsetDup(V, Lane, CastVT)) {
12285 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
12286 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12287 V.getOperand(0).getValueType().is128BitVector()) {
12288 // The lane is incremented by the index of the extract.
12289 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
12290 Lane += V.getConstantOperandVal(1);
12291 V = V.getOperand(0);
12292 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
12293 // The lane is decremented if we are splatting from the 2nd operand.
12294 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
12295 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
12296 Lane -= Idx * VT.getVectorNumElements() / 2;
12297 V = WidenVector(V.getOperand(Idx), DAG);
12298 } else if (VT.getSizeInBits() == 64) {
12299 // Widen the operand to 128-bit register with undef.
12300 V = WidenVector(V, DAG);
12301 }
12302 return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
12303}
12304
12305 // Return true if we can derive a new (half-length) shuffle mask from the
12306 // given mask: every pair of adjacent mask values must be consecutive and
12307 // must start at an even value (or be undef).
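// For example, the v4i32 mask <0, 1, 6, 7> pairs up into the v2i64 mask
// <0, 3>, while <1, 2, 5, 6> is rejected because its pairs do not start at an
// even mask value.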
12308static bool isWideTypeMask(ArrayRef<int> M, EVT VT,
12309 SmallVectorImpl<int> &NewMask) {
12310 unsigned NumElts = VT.getVectorNumElements();
12311 if (NumElts % 2 != 0)
12312 return false;
12313
12314 NewMask.clear();
12315 for (unsigned i = 0; i < NumElts; i += 2) {
12316 int M0 = M[i];
12317 int M1 = M[i + 1];
12318
12319 // If both elements are undef, new mask is undef too.
12320 if (M0 == -1 && M1 == -1) {
12321 NewMask.push_back(-1);
12322 continue;
12323 }
12324
12325 if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
12326 NewMask.push_back(M1 / 2);
12327 continue;
12328 }
12329
12330 if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
12331 NewMask.push_back(M0 / 2);
12332 continue;
12333 }
12334
12335 NewMask.clear();
12336 return false;
12337 }
12338
12339 assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
12340 return true;
12341}
12342
12343// Try to widen element type to get a new mask value for a better permutation
12344// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
12345// UZP1/2, TRN1/2, REV, INS, etc.
12346// For example:
12347// shufflevector <4 x i32> %a, <4 x i32> %b,
12348// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
12349// is equivalent to:
12350// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
12351// Finally, we can get:
12352// mov v0.d[0], v1.d[1]
12353static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
12354 SDLoc DL(Op);
12355 EVT VT = Op.getValueType();
12356 EVT ScalarVT = VT.getVectorElementType();
12357 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
12358 SDValue V0 = Op.getOperand(0);
12359 SDValue V1 = Op.getOperand(1);
12360 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
12361
12362 // If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ...
12363 // We need to make sure the wider element type is legal. Thus, ElementSize
12364 // should not be larger than 32 bits, and the i1 type should also be excluded.
12365 if (ElementSize > 32 || ElementSize == 1)
12366 return SDValue();
12367
12368 SmallVector<int, 8> NewMask;
12369 if (isWideTypeMask(Mask, VT, NewMask)) {
12370 MVT NewEltVT = VT.isFloatingPoint()
12371 ? MVT::getFloatingPointVT(ElementSize * 2)
12372 : MVT::getIntegerVT(ElementSize * 2);
12373 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
12374 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
12375 V0 = DAG.getBitcast(NewVT, V0);
12376 V1 = DAG.getBitcast(NewVT, V1);
12377 return DAG.getBitcast(VT,
12378 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
12379 }
12380 }
12381
12382 return SDValue();
12383}
12384
12385// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
12386static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
12387 ArrayRef<int> ShuffleMask,
12388 SelectionDAG &DAG) {
12389 SDValue Tbl1 = Op->getOperand(0);
12390 SDValue Tbl2 = Op->getOperand(1);
12391 SDLoc dl(Op);
12392 SDValue Tbl2ID =
12393 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64);
12394
12395 EVT VT = Op.getValueType();
12396 if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
12397 Tbl1->getOperand(0) != Tbl2ID ||
12398 Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
12399 Tbl2->getOperand(0) != Tbl2ID)
12400 return SDValue();
12401
12402 if (Tbl1->getValueType(0) != MVT::v16i8 ||
12403 Tbl2->getValueType(0) != MVT::v16i8)
12404 return SDValue();
12405
12406 SDValue Mask1 = Tbl1->getOperand(3);
12407 SDValue Mask2 = Tbl2->getOperand(3);
12408 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
12409 for (unsigned I = 0; I < 16; I++) {
12410 if (ShuffleMask[I] < 16)
12411 TBLMaskParts[I] = Mask1->getOperand(ShuffleMask[I]);
12412 else {
12413 auto *C =
12414 dyn_cast<ConstantSDNode>(Mask2->getOperand(ShuffleMask[I] - 16));
12415 if (!C)
12416 return SDValue();
12417 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32);
12418 }
12419 }
12420
12421 SDValue TBLMask = DAG.getBuildVector(VT, dl, TBLMaskParts);
12422 SDValue ID =
12423 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64);
12424
12425 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
12426 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
12427 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
12428}
12429
12430// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
12431// but we don't have an appropriate instruction,
12432// so custom-lower it as ZIP1-with-zeros.
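// For example (little endian), zero-extending v8i8 in-register to v8i16 is
// lowered as zip1(src, zeros) on v8i8 and reinterpreting the result as v8i16.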
12433SDValue
12434AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
12435 SelectionDAG &DAG) const {
12436 SDLoc dl(Op);
12437 EVT VT = Op.getValueType();
12438 SDValue SrcOp = Op.getOperand(0);
12439 EVT SrcVT = SrcOp.getValueType();
12440 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
12441 "Unexpected extension factor.");
12442 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
12443 // FIXME: support multi-step zipping?
12444 if (Scale != 2)
12445 return SDValue();
12446 SDValue Zeros = DAG.getConstant(0, dl, SrcVT);
12447 return DAG.getBitcast(VT,
12448 DAG.getNode(AArch64ISD::ZIP1, dl, SrcVT, SrcOp, Zeros));
12449}
12450
12451SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
12452 SelectionDAG &DAG) const {
12453 SDLoc dl(Op);
12454 EVT VT = Op.getValueType();
12455
12456 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
12457
12458 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
12459 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
12460
12461 // Convert shuffles that are directly supported on NEON to target-specific
12462 // DAG nodes, instead of keeping them as shuffles and matching them again
12463 // during code selection. This is more efficient and avoids the possibility
12464 // of inconsistencies between legalization and selection.
12465 ArrayRef<int> ShuffleMask = SVN->getMask();
12466
12467 SDValue V1 = Op.getOperand(0);
12468 SDValue V2 = Op.getOperand(1);
12469
12470 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
12471 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
12472 "Unexpected VECTOR_SHUFFLE mask size!");
12473
12474 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
12475 return Res;
12476
12477 if (SVN->isSplat()) {
12478 int Lane = SVN->getSplatIndex();
12479 // If this is undef splat, generate it via "just" vdup, if possible.
12480 if (Lane == -1)
12481 Lane = 0;
12482
12483 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
12484 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
12485 V1.getOperand(0));
12486 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
12487 // constant. If so, we can just reference the lane's definition directly.
12488 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
12489 !isa<ConstantSDNode>(V1.getOperand(Lane)))
12490 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
12491
12492 // Otherwise, duplicate from the lane of the input vector.
12493 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
12494 return constructDup(V1, Lane, dl, VT, Opcode, DAG);
12495 }
12496
12497 // Check if the mask matches a DUP for a wider element
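// For example, the v8i16 mask <4, 5, 4, 5, 4, 5, 4, 5> is a 32-bit wide DUP:
// it splats lane 2 of the input when viewed as v4i32.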
12498 for (unsigned LaneSize : {64U, 32U, 16U}) {
12499 unsigned Lane = 0;
12500 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
12501 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
12502 : LaneSize == 32 ? AArch64ISD::DUPLANE32
12503 : AArch64ISD::DUPLANE16;
12504 // Cast V1 to an integer vector with required lane size
12505 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
12506 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
12507 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
12508 V1 = DAG.getBitcast(NewVecTy, V1);
12509 // Construct the DUP instruction
12510 V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
12511 // Cast back to the original type
12512 return DAG.getBitcast(VT, V1);
12513 }
12514 }
12515
12516 if (isREVMask(ShuffleMask, VT, 64))
12517 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
12518 if (isREVMask(ShuffleMask, VT, 32))
12519 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
12520 if (isREVMask(ShuffleMask, VT, 16))
12521 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
12522
12523 if (((VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 16) ||
12524 (VT.getVectorNumElements() == 16 && VT.getScalarSizeInBits() == 8)) &&
12525 ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
12526 SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
12527 return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
12528 DAG.getConstant(8, dl, MVT::i32));
12529 }
12530
12531 bool ReverseEXT = false;
12532 unsigned Imm;
12533 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
12534 if (ReverseEXT)
12535 std::swap(V1, V2);
12536 Imm *= getExtFactor(V1);
12537 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
12538 DAG.getConstant(Imm, dl, MVT::i32));
12539 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
12540 Imm *= getExtFactor(V1);
12541 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
12542 DAG.getConstant(Imm, dl, MVT::i32));
12543 }
12544
12545 unsigned WhichResult;
12546 if (isZIPMask(ShuffleMask, VT, WhichResult)) {
12547 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
12548 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
12549 }
12550 if (isUZPMask(ShuffleMask, VT, WhichResult)) {
12551 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
12552 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
12553 }
12554 if (isTRNMask(ShuffleMask, VT, WhichResult)) {
12555 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
12556 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
12557 }
12558
12559 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
12560 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
12561 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
12562 }
12563 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
12564 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
12565 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
12566 }
12567 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
12568 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
12569 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
12570 }
12571
12572 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
12573 return Concat;
12574
12575 bool DstIsLeft;
12576 int Anomaly;
12577 int NumInputElements = V1.getValueType().getVectorNumElements();
12578 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
12579 SDValue DstVec = DstIsLeft ? V1 : V2;
12580 SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
12581
12582 SDValue SrcVec = V1;
12583 int SrcLane = ShuffleMask[Anomaly];
12584 if (SrcLane >= NumInputElements) {
12585 SrcVec = V2;
12586 SrcLane -= VT.getVectorNumElements();
12587 }
12588 SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
12589
12590 EVT ScalarVT = VT.getVectorElementType();
12591
12592 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
12593 ScalarVT = MVT::i32;
12594
12595 return DAG.getNode(
12596 ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
12597 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
12598 DstLaneV);
12599 }
12600
12601 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
12602 return NewSD;
12603
12604 // If the shuffle is not directly supported and it has 4 elements, use
12605 // the PerfectShuffle-generated table to synthesize it from other shuffles.
12606 unsigned NumElts = VT.getVectorNumElements();
12607 if (NumElts == 4) {
12608 unsigned PFIndexes[4];
12609 for (unsigned i = 0; i != 4; ++i) {
12610 if (ShuffleMask[i] < 0)
12611 PFIndexes[i] = 8;
12612 else
12613 PFIndexes[i] = ShuffleMask[i];
12614 }
12615
12616 // Compute the index in the perfect shuffle table.
12617 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
12618 PFIndexes[2] * 9 + PFIndexes[3];
12619 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
12620 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
12621 dl);
12622 }
12623
12624 return GenerateTBL(Op, ShuffleMask, DAG);
12625}
12626
12627SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
12628 SelectionDAG &DAG) const {
12629 EVT VT = Op.getValueType();
12630
12631 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
12632 return LowerToScalableOp(Op, DAG);
12633
12634 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
12635 "Unexpected vector type!");
12636
12637 // We can handle the constant cases during isel.
12638 if (isa<ConstantSDNode>(Op.getOperand(0)))
12639 return Op;
12640
12641 // There isn't a natural way to handle the general i1 case, so we use some
12642 // trickery with whilelo.
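// After the sign-extension below, the splat value is either 0 or all-ones, so
// whilelo(0, splat) yields an all-false predicate for 0 and an all-true
// predicate for all-ones (which compares as the maximum unsigned value).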
12643 SDLoc DL(Op);
12644 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
12645 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
12646 DAG.getValueType(MVT::i1));
12647 SDValue ID =
12648 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
12649 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
12650 if (VT == MVT::nxv1i1)
12651 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
12652 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
12653 Zero, SplatVal),
12654 Zero);
12655 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
12656}
12657
12658SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
12659 SelectionDAG &DAG) const {
12660 SDLoc DL(Op);
12661
12662 EVT VT = Op.getValueType();
12663 if (!isTypeLegal(VT) || !VT.isScalableVector())
12664 return SDValue();
12665
12666 // Current lowering only supports the SVE-ACLE types.
12667 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
12668 return SDValue();
12669
12670 // The DUPQ operation is independent of element type so normalise to i64s.
12671 SDValue Idx128 = Op.getOperand(2);
12672
12673 // DUPQ can be used when idx is in range.
12674 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
12675 if (CIdx && (CIdx->getZExtValue() <= 3)) {
12676 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
12677 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
12678 }
12679
12680 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
12681
12682 // The ACLE says this must produce the same result as:
12683 // svtbl(data, svadd_x(svptrue_b64(),
12684 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
12685 // index * 2))
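// For example, for index 1 the final shuffle mask below is <2, 3, 2, 3, ...>,
// which broadcasts 128-bit quadword 1 across the whole vector.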
12686 SDValue One = DAG.getConstant(1, DL, MVT::i64);
12687 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
12688
12689 // create the vector 0,1,0,1,...
12690 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
12691 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
12692
12693 // create the vector idx64,idx64+1,idx64,idx64+1,...
12694 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
12695 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
12696 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
12697
12698 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
12699 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
12700 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
12701}
12702
12703
12704static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
12705 APInt &UndefBits) {
12706 EVT VT = BVN->getValueType(0);
12707 APInt SplatBits, SplatUndef;
12708 unsigned SplatBitSize;
12709 bool HasAnyUndefs;
12710 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
12711 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
12712
12713 for (unsigned i = 0; i < NumSplats; ++i) {
12714 CnstBits <<= SplatBitSize;
12715 UndefBits <<= SplatBitSize;
12716 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
12717 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
12718 }
12719
12720 return true;
12721 }
12722
12723 return false;
12724}
12725
12726// Try 64-bit splatted SIMD immediate.
12727static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12728 const APInt &Bits) {
12729 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12730 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12731 EVT VT = Op.getValueType();
12732 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
12733
12734 if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
12735 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
12736
12737 SDLoc dl(Op);
12738 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12739 DAG.getConstant(Value, dl, MVT::i32));
12740 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12741 }
12742 }
12743
12744 return SDValue();
12745}
12746
12747// Try 32-bit splatted SIMD immediate.
12748static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12749 const APInt &Bits,
12750 const SDValue *LHS = nullptr) {
12751 EVT VT = Op.getValueType();
12752 if (VT.isFixedLengthVector() &&
12753 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
12754 return SDValue();
12755
12756 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12757 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12758 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
12759 bool isAdvSIMDModImm = false;
12760 uint64_t Shift;
12761
12762 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
12763 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
12764 Shift = 0;
12765 }
12766 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
12767 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
12768 Shift = 8;
12769 }
12770 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
12771 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
12772 Shift = 16;
12773 }
12774 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
12775 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
12776 Shift = 24;
12777 }
12778
12779 if (isAdvSIMDModImm) {
12780 SDLoc dl(Op);
12781 SDValue Mov;
12782
12783 if (LHS)
12784 Mov = DAG.getNode(NewOp, dl, MovTy,
12785 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
12786 DAG.getConstant(Value, dl, MVT::i32),
12787 DAG.getConstant(Shift, dl, MVT::i32));
12788 else
12789 Mov = DAG.getNode(NewOp, dl, MovTy,
12790 DAG.getConstant(Value, dl, MVT::i32),
12791 DAG.getConstant(Shift, dl, MVT::i32));
12792
12793 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12794 }
12795 }
12796
12797 return SDValue();
12798}
12799
12800// Try 16-bit splatted SIMD immediate.
12801static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12802 const APInt &Bits,
12803 const SDValue *LHS = nullptr) {
12804 EVT VT = Op.getValueType();
12805 if (VT.isFixedLengthVector() &&
12806 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
12807 return SDValue();
12808
12809 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12810 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12811 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
12812 bool isAdvSIMDModImm = false;
12813 uint64_t Shift;
12814
12815 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
12816 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
12817 Shift = 0;
12818 }
12819 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
12820 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
12821 Shift = 8;
12822 }
12823
12824 if (isAdvSIMDModImm) {
12825 SDLoc dl(Op);
12826 SDValue Mov;
12827
12828 if (LHS)
12829 Mov = DAG.getNode(NewOp, dl, MovTy,
12830 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
12831 DAG.getConstant(Value, dl, MVT::i32),
12832 DAG.getConstant(Shift, dl, MVT::i32));
12833 else
12834 Mov = DAG.getNode(NewOp, dl, MovTy,
12835 DAG.getConstant(Value, dl, MVT::i32),
12836 DAG.getConstant(Shift, dl, MVT::i32));
12837
12838 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12839 }
12840 }
12841
12842 return SDValue();
12843}
12844
12845// Try 32-bit splatted SIMD immediate with shifted ones.
12846static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
12847 SelectionDAG &DAG, const APInt &Bits) {
12848 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12849 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12850 EVT VT = Op.getValueType();
12851 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
12852 bool isAdvSIMDModImm = false;
12853 uint64_t Shift;
12854
12855 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
12856 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
12857 Shift = 264;
12858 }
12859 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
12860 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
12861 Shift = 272;
12862 }
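// Note: the shift amounts 264 and 272 appear to be AArch64_AM shifter
// immediates for MSL #8 and MSL #16 (the "shifting ones" forms), matching the
// Type7/Type8 checks above.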
12863
12864 if (isAdvSIMDModImm) {
12865 SDLoc dl(Op);
12866 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12867 DAG.getConstant(Value, dl, MVT::i32),
12868 DAG.getConstant(Shift, dl, MVT::i32));
12869 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12870 }
12871 }
12872
12873 return SDValue();
12874}
12875
12876// Try 8-bit splatted SIMD immediate.
12877static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12878 const APInt &Bits) {
12879 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12880 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12881 EVT VT = Op.getValueType();
12882 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
12883
12884 if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
12885 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
12886
12887 SDLoc dl(Op);
12888 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12889 DAG.getConstant(Value, dl, MVT::i32));
12890 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12891 }
12892 }
12893
12894 return SDValue();
12895}
12896
12897// Try FP splatted SIMD immediate.
12898static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12899 const APInt &Bits) {
12900 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12901 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12902 EVT VT = Op.getValueType();
12903 bool isWide = (VT.getSizeInBits() == 128);
12904 MVT MovTy;
12905 bool isAdvSIMDModImm = false;
12906
12907 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
12908 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
12909 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
12910 }
12911 else if (isWide &&
12912 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
12913 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
12914 MovTy = MVT::v2f64;
12915 }
12916
12917 if (isAdvSIMDModImm) {
12918 SDLoc dl(Op);
12919 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12920 DAG.getConstant(Value, dl, MVT::i32));
12921 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12922 }
12923 }
12924
12925 return SDValue();
12926}
12927
12928// Specialized code to quickly find if PotentialBVec is a BuildVector that
12929// consists of only the same constant int value, returned in reference arg
12930// ConstVal
12931static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
12932 uint64_t &ConstVal) {
12933 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
12934 if (!Bvec)
12935 return false;
12936 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
12937 if (!FirstElt)
12938 return false;
12939 EVT VT = Bvec->getValueType(0);
12940 unsigned NumElts = VT.getVectorNumElements();
12941 for (unsigned i = 1; i < NumElts; ++i)
12942 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
12943 return false;
12944 ConstVal = FirstElt->getZExtValue();
12945 return true;
12946}
12947
12948static bool isAllInactivePredicate(SDValue N) {
12949 // Look through cast.
12950 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
12951 N = N.getOperand(0);
12952
12953 return ISD::isConstantSplatVectorAllZeros(N.getNode());
12954}
12955
12956static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
12957 unsigned NumElts = N.getValueType().getVectorMinNumElements();
12958
12959 // Look through cast.
12960 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
12961 N = N.getOperand(0);
12962 // When reinterpreting from a type with fewer elements the "new" elements
12963 // are not active, so bail if they're likely to be used.
12964 if (N.getValueType().getVectorMinNumElements() < NumElts)
12965 return false;
12966 }
12967
12968 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
12969 return true;
12970
12971 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
12972 // or smaller than the implicit element type represented by N.
12973 // NOTE: A larger element count implies a smaller element type.
12974 if (N.getOpcode() == AArch64ISD::PTRUE &&
12975 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
12976 return N.getValueType().getVectorMinNumElements() >= NumElts;
12977
12978 // If we're compiling for a specific vector-length, we can check if the
12979 // pattern's VL equals that of the scalable vector at runtime.
12980 if (N.getOpcode() == AArch64ISD::PTRUE) {
12981 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
12982 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
12983 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
12984 if (MaxSVESize && MinSVESize == MaxSVESize) {
12985 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
12986 unsigned PatNumElts =
12987 getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
12988 return PatNumElts == (NumElts * VScale);
12989 }
12990 }
12991
12992 return false;
12993}
12994
12995// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
12996// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
12997// BUILD_VECTORs with constant element C1, C2 is a constant, and:
12998// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
12999// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
13000// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
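// For example, for v8i16: (or (and X, splat(0x00ff)), (VSHL Y, #8)) becomes
// (VSLI X, Y, #8), since 0x00ff == ~(0xffff << 8).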
13001static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
13002 EVT VT = N->getValueType(0);
13003
13004 if (!VT.isVector())
13005 return SDValue();
13006
13007 SDLoc DL(N);
13008
13009 SDValue And;
13010 SDValue Shift;
13011
13012 SDValue FirstOp = N->getOperand(0);
13013 unsigned FirstOpc = FirstOp.getOpcode();
13014 SDValue SecondOp = N->getOperand(1);
13015 unsigned SecondOpc = SecondOp.getOpcode();
13016
13017 // Is one of the operands an AND or a BICi? The AND may have been optimised to
13018 // a BICi in order to use an immediate instead of a register.
13019 // Is the other operand a shl or lshr? This will have been turned into:
13020 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
13021 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
13022 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
13023 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
13024 SecondOpc == AArch64ISD::SHL_PRED ||
13025 SecondOpc == AArch64ISD::SRL_PRED)) {
13026 And = FirstOp;
13027 Shift = SecondOp;
13028
13029 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
13030 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
13031 FirstOpc == AArch64ISD::SHL_PRED ||
13032 FirstOpc == AArch64ISD::SRL_PRED)) {
13033 And = SecondOp;
13034 Shift = FirstOp;
13035 } else
13036 return SDValue();
13037
13038 bool IsAnd = And.getOpcode() == ISD::AND;
13039 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
13040 Shift.getOpcode() == AArch64ISD::SRL_PRED;
13041 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
13042 Shift.getOpcode() == AArch64ISD::SRL_PRED;
13043
13044 // Is the shift amount constant and are all lanes active?
13045 uint64_t C2;
13046 if (ShiftHasPredOp) {
13047 if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
13048 return SDValue();
13049 APInt C;
13050 if (!ISD::isConstantSplatVector(Shift.getOperand(2).getNode(), C))
13051 return SDValue();
13052 C2 = C.getZExtValue();
13053 } else if (ConstantSDNode *C2node =
13054 dyn_cast<ConstantSDNode>(Shift.getOperand(1)))
13055 C2 = C2node->getZExtValue();
13056 else
13057 return SDValue();
13058
13059 APInt C1AsAPInt;
13060 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
13061 if (IsAnd) {
13062 // Is the and mask vector all constant?
13063 if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
13064 return SDValue();
13065 } else {
13066 // Reconstruct the corresponding AND immediate from the two BICi immediates.
13067 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
13068 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
13069 assert(C1nodeImm && C1nodeShift);
13070 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
13071 C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
13072 }
13073
13074 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
13075 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
13076 // how much one can shift elements of a particular size?
13077 if (C2 > ElemSizeInBits)
13078 return SDValue();
13079
13080 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
13081 : APInt::getLowBitsSet(ElemSizeInBits, C2);
13082 if (C1AsAPInt != RequiredC1)
13083 return SDValue();
13084
13085 SDValue X = And.getOperand(0);
13086 SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
13087 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
13088 : Shift.getOperand(1);
13089
13090 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
13091 SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Imm);
13092
13093 LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
13094 LLVM_DEBUG(N->dump(&DAG));
13095 LLVM_DEBUG(dbgs() << "into: \n");
13096 LLVM_DEBUG(ResultSLI->dump(&DAG));
13097
13098 ++NumShiftInserts;
13099 return ResultSLI;
13100}
13101
13102SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
13103 SelectionDAG &DAG) const {
13104 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13105 !Subtarget->isNeonAvailable()))
13106 return LowerToScalableOp(Op, DAG);
13107
13108 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
13109 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
13110 return Res;
13111
13112 EVT VT = Op.getValueType();
13113 if (VT.isScalableVector())
13114 return Op;
13115
13116 SDValue LHS = Op.getOperand(0);
13117 BuildVectorSDNode *BVN =
13118 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
13119 if (!BVN) {
13120 // OR commutes, so try swapping the operands.
13121 LHS = Op.getOperand(1);
13122 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
13123 }
13124 if (!BVN)
13125 return Op;
13126
13127 APInt DefBits(VT.getSizeInBits(), 0);
13128 APInt UndefBits(VT.getSizeInBits(), 0);
13129 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
13130 SDValue NewOp;
13131
13132 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
13133 DefBits, &LHS)) ||
13134 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
13135 DefBits, &LHS)))
13136 return NewOp;
13137
13138 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
13139 UndefBits, &LHS)) ||
13140 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
13141 UndefBits, &LHS)))
13142 return NewOp;
13143 }
13144
13145 // We can always fall back to a non-immediate OR.
13146 return Op;
13147}
13148
13149// Normalize the operands of BUILD_VECTOR. The value of constant operands will
13150// be truncated to fit element width.
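// For example, a v8i8 BUILD_VECTOR operand holding the (promoted) constant
// 0x1ff is replaced by the i32 constant 0xff.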
13151static SDValue NormalizeBuildVector(SDValue Op,
13152 SelectionDAG &DAG) {
13153 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13154 SDLoc dl(Op);
13155 EVT VT = Op.getValueType();
13156 EVT EltTy= VT.getVectorElementType();
13157
13158 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
13159 return Op;
13160
13161 SmallVector<SDValue, 16> Ops;
13162 for (SDValue Lane : Op->ops()) {
13163 // For integer vectors, type legalization would have promoted the
13164 // operands already. Otherwise, if Op is a floating-point splat
13165 // (with operands cast to integers), then the only possibilities
13166 // are constants and UNDEFs.
13167 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
13168 APInt LowBits(EltTy.getSizeInBits(),
13169 CstLane->getZExtValue());
13170 Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
13171 } else if (Lane.getNode()->isUndef()) {
13172 Lane = DAG.getUNDEF(MVT::i32);
13173 } else {
13174 assert(Lane.getValueType() == MVT::i32 &&
13175 "Unexpected BUILD_VECTOR operand type");
13176 }
13177 Ops.push_back(Lane);
13178 }
13179 return DAG.getBuildVector(VT, dl, Ops);
13180}
13181
13182static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
13183 const AArch64Subtarget *ST) {
13184 EVT VT = Op.getValueType();
13185 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
13186 "Expected a legal NEON vector");
13187
13188 APInt DefBits(VT.getSizeInBits(), 0);
13189 APInt UndefBits(VT.getSizeInBits(), 0);
13190 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
13191 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
13192 auto TryMOVIWithBits = [&](APInt DefBits) {
13193 SDValue NewOp;
13194 if ((NewOp =
13195 tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
13196 (NewOp =
13197 tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
13198 (NewOp =
13199 tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
13200 (NewOp =
13201 tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
13202 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
13203 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
13204 return NewOp;
13205
13206 APInt NotDefBits = ~DefBits;
13207 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
13208 NotDefBits)) ||
13209 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
13210 NotDefBits)) ||
13211 (NewOp =
13212 tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
13213 return NewOp;
13214 return SDValue();
13215 };
13216 if (SDValue R = TryMOVIWithBits(DefBits))
13217 return R;
13218 if (SDValue R = TryMOVIWithBits(UndefBits))
13219 return R;
13220
13221 // See if a fneg of the constant can be materialized with a MOVI, etc
13222 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
13223 // FNegate each sub-element of the constant
13224 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
13225 APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
13226 .zext(VT.getSizeInBits());
13227 APInt NegBits(VT.getSizeInBits(), 0);
13228 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
13229 for (unsigned i = 0; i < NumElts; i++)
13230 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
13231 NegBits = DefBits ^ NegBits;
13232
13233 // Try to create the new constants with MOVI, and if so generate a fneg
13234 // for it.
13235 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
13236 SDLoc DL(Op);
13237 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
13238 return DAG.getNode(
13239 AArch64ISD::NVCAST, DL, VT,
13240 DAG.getNode(ISD::FNEG, DL, VFVT,
13241 DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
13242 }
13243 return SDValue();
13244 };
13245 SDValue R;
13246 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
13247 (R = TryWithFNeg(DefBits, MVT::f64)) ||
13248 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
13249 return R;
13250 }
13251
13252 return SDValue();
13253}
13254
13255SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
13256 SelectionDAG &DAG) const {
13257 EVT VT = Op.getValueType();
13258
13259 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
13260 if (auto SeqInfo = cast<BuildVectorSDNode>(Op)->isConstantSequence()) {
13261 SDLoc DL(Op);
13262 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
13263 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
13264 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
13265 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
13266 return convertFromScalableVector(DAG, Op.getValueType(), Seq);
13267 }
13268
13269 // Revert to common legalisation for all other variants.
13270 return SDValue();
13271 }
13272
13273 // Try to build a simple constant vector.
13274 Op = NormalizeBuildVector(Op, DAG);
13275 // Though this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so,
13276 // abort.
13277 if (Op.getOpcode() != ISD::BUILD_VECTOR)
13278 return SDValue();
13279
13280 // Certain vector constants, used to express things like logical NOT and
13281 // arithmetic NEG, are passed through unmodified. This allows special
13282 // patterns for these operations to match, which will lower these constants
13283 // to whatever is proven necessary.
13284 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
13285 if (BVN->isConstant()) {
13286 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
13287 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
13288 APInt Val(BitSize,
13289 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
13290 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
13291 return Op;
13292 }
13293 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
13294 if (Const->isZero() && !Const->isNegative())
13295 return Op;
13296 }
13297
13298 if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
13299 return V;
13300
13301 // Scan through the operands to find some interesting properties we can
13302 // exploit:
13303 // 1) If only one value is used, we can use a DUP, or
13304 // 2) if only the low element is not undef, we can just insert that, or
13305 // 3) if only one constant value is used (w/ some non-constant lanes),
13306 // we can splat the constant value into the whole vector then fill
13307 // in the non-constant lanes.
13308 // 4) FIXME: If different constant values are used, but we can intelligently
13309 // select the values we'll be overwriting for the non-constant
13310 // lanes such that we can directly materialize the vector
13311 // some other way (MOVI, e.g.), we can be sneaky.
13312 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
13313 SDLoc dl(Op);
13314 unsigned NumElts = VT.getVectorNumElements();
13315 bool isOnlyLowElement = true;
13316 bool usesOnlyOneValue = true;
13317 bool usesOnlyOneConstantValue = true;
13318 bool isConstant = true;
13319 bool AllLanesExtractElt = true;
13320 unsigned NumConstantLanes = 0;
13321 unsigned NumDifferentLanes = 0;
13322 unsigned NumUndefLanes = 0;
13323 SDValue Value;
13324 SDValue ConstantValue;
13325 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
13326 unsigned ConsecutiveValCount = 0;
13327 SDValue PrevVal;
13328 for (unsigned i = 0; i < NumElts; ++i) {
13329 SDValue V = Op.getOperand(i);
13330 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13331 AllLanesExtractElt = false;
13332 if (V.isUndef()) {
13333 ++NumUndefLanes;
13334 continue;
13335 }
13336 if (i > 0)
13337 isOnlyLowElement = false;
13338 if (!isIntOrFPConstant(V))
13339 isConstant = false;
13340
13341 if (isIntOrFPConstant(V)) {
13342 ++NumConstantLanes;
13343 if (!ConstantValue.getNode())
13344 ConstantValue = V;
13345 else if (ConstantValue != V)
13346 usesOnlyOneConstantValue = false;
13347 }
13348
13349 if (!Value.getNode())
13350 Value = V;
13351 else if (V != Value) {
13352 usesOnlyOneValue = false;
13353 ++NumDifferentLanes;
13354 }
13355
13356 if (PrevVal != V) {
13357 ConsecutiveValCount = 0;
13358 PrevVal = V;
13359 }
13360
13361 // Keep each different value and its last consecutive count. For example,
13362 //
13363 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
13364 // t24, t24, t24, t24, t24, t24, t24, t24
13365 // t23 = consecutive count 8
13366 // t24 = consecutive count 8
13367 // ------------------------------------------------------------------
13368 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
13369 // t24, t24, t24, t24, t24, t24, t24, t24
13370 // t23 = consecutive count 5
13371 // t24 = consecutive count 9
13372 DifferentValueMap[V] = ++ConsecutiveValCount;
13373 }
13374
13375 if (!Value.getNode()) {
13376 LLVM_DEBUG(
13377 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
13378 return DAG.getUNDEF(VT);
13379 }
13380
13381 // Convert BUILD_VECTOR where all elements but the lowest are undef into
13382 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
13383 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
13384 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
13385 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
13386 "SCALAR_TO_VECTOR node\n");
13387 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
13388 }
13389
13390 if (AllLanesExtractElt) {
13391 SDNode *Vector = nullptr;
13392 bool Even = false;
13393 bool Odd = false;
13394 // Check whether the extract elements match the Even pattern <0,2,4,...> or
13395 // the Odd pattern <1,3,5,...>.
13396 for (unsigned i = 0; i < NumElts; ++i) {
13397 SDValue V = Op.getOperand(i);
13398 const SDNode *N = V.getNode();
13399 if (!isa<ConstantSDNode>(N->getOperand(1))) {
13400 Even = false;
13401 Odd = false;
13402 break;
13403 }
13404 SDValue N0 = N->getOperand(0);
13405
13406 // All elements are extracted from the same vector.
13407 if (!Vector) {
13408 Vector = N0.getNode();
13409 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
13410 // BUILD_VECTOR.
13411 if (VT.getVectorElementType() !=
13412 N0.getValueType().getVectorElementType())
13413 break;
13414 } else if (Vector != N0.getNode()) {
13415 Odd = false;
13416 Even = false;
13417 break;
13418 }
13419
13420 // Extracted values are either at Even indices <0,2,4,...> or at Odd
13421 // indices <1,3,5,...>.
13422 uint64_t Val = N->getConstantOperandVal(1);
13423 if (Val == 2 * i) {
13424 Even = true;
13425 continue;
13426 }
13427 if (Val - 1 == 2 * i) {
13428 Odd = true;
13429 continue;
13430 }
13431
13432 // Something does not match: abort.
13433 Odd = false;
13434 Even = false;
13435 break;
13436 }
13437 if (Even || Odd) {
13438 SDValue LHS =
13439 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
13440 DAG.getConstant(0, dl, MVT::i64));
13441 SDValue RHS =
13442 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
13443 DAG.getConstant(NumElts, dl, MVT::i64));
13444
13445 if (Even && !Odd)
13446 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
13447 RHS);
13448 if (Odd && !Even)
13449 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
13450 RHS);
13451 }
13452 }
13453
13454 // Use DUP for non-constant splats. For f32 constant splats, reduce to
13455 // i32 and try again.
13456 if (usesOnlyOneValue) {
13457 if (!isConstant) {
13458 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13459 Value.getValueType() != VT) {
13460 LLVM_DEBUG(
13461 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
13462 return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
13463 }
13464
13465 // This is actually a DUPLANExx operation, which keeps everything vectory.
13466
13467 SDValue Lane = Value.getOperand(1);
13468 Value = Value.getOperand(0);
13469 if (Value.getValueSizeInBits() == 64) {
13470 LLVM_DEBUG(
13471 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
13472 "widening it\n");
13473 Value = WidenVector(Value, DAG);
13474 }
13475
13476 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
13477 return DAG.getNode(Opcode, dl, VT, Value, Lane);
13478 }
13479
13480 if (VT.getVectorElementType().isFloatingPoint()) {
13481 SmallVector<SDValue, 8> Ops;
13482 EVT EltTy = VT.getVectorElementType();
13483 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
13484 EltTy == MVT::f64) && "Unsupported floating-point vector type");
13485 LLVM_DEBUG(
13486 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
13487 "BITCASTS, and try again\n");
13488 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
13489 for (unsigned i = 0; i < NumElts; ++i)
13490 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
13491 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
13492 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
13493 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
13494 Val.dump(););
13495 Val = LowerBUILD_VECTOR(Val, DAG);
13496 if (Val.getNode())
13497 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
13498 }
13499 }
13500
13501 // If we need to insert a small number of different non-constant elements and
13502 // the vector width is sufficiently large, prefer using DUP with the common
13503 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
13504 // skip the constant lane handling below.
13505 bool PreferDUPAndInsert =
13506 !isConstant && NumDifferentLanes >= 1 &&
13507 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
13508 NumDifferentLanes >= NumConstantLanes;
13509
13510 // If only one constant value was used, but for more than one lane,
13511 // start by splatting that value, then replace the non-constant lanes. This
13512 // is better than the default, which will perform a separate initialization
13513 // for each lane.
13514 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
13515 // Firstly, try to materialize the splat constant.
13516 SDValue Val = DAG.getSplatBuildVector(VT, dl, ConstantValue);
13517 unsigned BitSize = VT.getScalarSizeInBits();
13518 APInt ConstantValueAPInt(1, 0);
13519 if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
13520 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
13521 if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
13522 !ConstantValueAPInt.isAllOnes()) {
13523 Val = ConstantBuildVector(Val, DAG, Subtarget);
13524 if (!Val)
13525 // Otherwise, materialize the constant and splat it.
13526 Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
13527 }
13528
13529 // Now insert the non-constant lanes.
13530 for (unsigned i = 0; i < NumElts; ++i) {
13531 SDValue V = Op.getOperand(i);
13532 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
13533 if (!isIntOrFPConstant(V))
13534 // Note that type legalization likely mucked about with the VT of the
13535 // source operand, so we may have to convert it here before inserting.
13536 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
13537 }
13538 return Val;
13539 }
13540
13541 // This will generate a load from the constant pool.
13542 if (isConstant) {
13543 LLVM_DEBUG(
13544 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
13545 "expansion\n");
13546 return SDValue();
13547 }
13548
13549 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
13550 // v4i32s. This is really a truncate, which we can construct out of (legal)
13551 // concats and truncate nodes.
13552 if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
13553 return M;
13554
13555 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
13556 if (NumElts >= 4) {
13557 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
13558 return Shuffle;
13559
13560 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
13561 return Shuffle;
13562 }
13563
13564 if (PreferDUPAndInsert) {
13565 // First, build a constant vector with the common element.
13566 SmallVector<SDValue, 8> Ops(NumElts, Value);
13567 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
13568 // Next, insert the elements that do not match the common value.
13569 for (unsigned I = 0; I < NumElts; ++I)
13570 if (Op.getOperand(I) != Value)
13571 NewVector =
13572 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
13573 Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
13574
13575 return NewVector;
13576 }
13577
13578 // If vector consists of two different values, try to generate two DUPs and
13579 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
13580 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
13581 SmallVector<SDValue, 2> Vals;
13582 // Check that the consecutive count of each value is half the number of vector
13583 // elements. In this case, we can use CONCAT_VECTORS. For example,
13584 //
13585 // canUseVECTOR_CONCAT = true;
13586 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
13587 // t24, t24, t24, t24, t24, t24, t24, t24
13588 //
13589 // canUseVECTOR_CONCAT = false;
13590 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
13591 // t24, t24, t24, t24, t24, t24, t24, t24
13592 bool canUseVECTOR_CONCAT = true;
13593 for (auto Pair : DifferentValueMap) {
13594      // Check that each distinct value occurs exactly NumElts / 2 times.
13595 if (Pair.second != NumElts / 2)
13596 canUseVECTOR_CONCAT = false;
13597 Vals.push_back(Pair.first);
13598 }
13599
13600 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
13601 // CONCAT_VECTORs. For example,
13602 //
13603 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
13604 // t24, t24, t24, t24, t24, t24, t24, t24
13605 // ==>
13606 // t26: v8i8 = AArch64ISD::DUP t23
13607 // t28: v8i8 = AArch64ISD::DUP t24
13608 // t29: v16i8 = concat_vectors t26, t28
13609 if (canUseVECTOR_CONCAT) {
13610 EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
13611 if (isTypeLegal(SubVT) && SubVT.isVector() &&
13612 SubVT.getVectorNumElements() >= 2) {
13613 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
13614 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
13615 SDValue DUP1 =
13616 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops1), DAG);
13617 SDValue DUP2 =
13618 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops2), DAG);
13619        SDValue CONCAT_VECTORS =
13620            DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DUP1, DUP2);
13621 return CONCAT_VECTORS;
13622 }
13623 }
13624
13625 // Let's try to generate VECTOR_SHUFFLE. For example,
13626 //
13627 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
13628 // ==>
13629 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
13630 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
13631 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
13632 if (NumElts >= 8) {
13633 SmallVector<int, 16> MaskVec;
13634      // Build mask for VECTOR_SHUFFLE.
13635 SDValue FirstLaneVal = Op.getOperand(0);
13636 for (unsigned i = 0; i < NumElts; ++i) {
13637 SDValue Val = Op.getOperand(i);
13638 if (FirstLaneVal == Val)
13639 MaskVec.push_back(i);
13640 else
13641 MaskVec.push_back(i + NumElts);
13642 }
13643
13644 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
13645 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
13646 SDValue VEC1 = DAG.getBuildVector(VT, dl, Ops1);
13647 SDValue VEC2 = DAG.getBuildVector(VT, dl, Ops2);
13648      SDValue VECTOR_SHUFFLE =
13649          DAG.getVectorShuffle(VT, dl, VEC1, VEC2, MaskVec);
13650 return VECTOR_SHUFFLE;
13651 }
13652 }
13653
13654 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
13655 // know the default expansion would otherwise fall back on something even
13656 // worse. For a vector with one or two non-undef values, that's
13657 // scalar_to_vector for the elements followed by a shuffle (provided the
13658 // shuffle is valid for the target) and materialization element by element
13659 // on the stack followed by a load for everything else.
13660 if (!isConstant && !usesOnlyOneValue) {
13661 LLVM_DEBUG(
13662 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
13663 "of INSERT_VECTOR_ELT\n");
13664
13665 SDValue Vec = DAG.getUNDEF(VT);
13666 SDValue Op0 = Op.getOperand(0);
13667 unsigned i = 0;
13668
13669 // Use SCALAR_TO_VECTOR for lane zero to
13670 // a) Avoid a RMW dependency on the full vector register, and
13671 // b) Allow the register coalescer to fold away the copy if the
13672 // value is already in an S or D register, and we're forced to emit an
13673 // INSERT_SUBREG that we can't fold anywhere.
13674 //
13675 // We also allow types like i8 and i16 which are illegal scalar but legal
13676 // vector element types. After type-legalization the inserted value is
13677 // extended (i32) and it is safe to cast them to the vector type by ignoring
13678 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
13679 if (!Op0.isUndef()) {
13680 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
13681 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
13682 ++i;
13683 }
13684 LLVM_DEBUG(if (i < NumElts) dbgs()
13685 << "Creating nodes for the other vector elements:\n";);
13686 for (; i < NumElts; ++i) {
13687 SDValue V = Op.getOperand(i);
13688 if (V.isUndef())
13689 continue;
13690 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
13691 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
13692 }
13693 return Vec;
13694 }
13695
13696 LLVM_DEBUG(
13697 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
13698 "better alternative\n");
13699 return SDValue();
13700}
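// A minimal standalone sketch (not part of this file) of the "splat the most
// common value, then patch the differing lanes" strategy LowerBUILD_VECTOR
// prefers above. It uses a plain std::array<int, 8> in place of DAG nodes, so
// the helper name and types are illustrative assumptions only.
#include <array>
static std::array<int, 8> dupThenInsertSketch(int CommonVal,
                                              const std::array<int, 8> &Lanes) {
  std::array<int, 8> V;
  V.fill(CommonVal); // one DUP-style splat of the common value
  for (unsigned I = 0; I < 8; ++I)
    if (Lanes[I] != CommonVal)
      V[I] = Lanes[I]; // one INSERT_VECTOR_ELT per lane that differs
  return V;
}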
13701
13702SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
13703 SelectionDAG &DAG) const {
13704 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13705 !Subtarget->isNeonAvailable()))
13706 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
13707
13708 assert(Op.getValueType().isScalableVector() &&
13709 isTypeLegal(Op.getValueType()) &&
13710 "Expected legal scalable vector type!");
13711
13712 if (isTypeLegal(Op.getOperand(0).getValueType())) {
13713 unsigned NumOperands = Op->getNumOperands();
13714 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
13715 "Unexpected number of operands in CONCAT_VECTORS");
13716
13717 if (NumOperands == 2)
13718 return Op;
13719
13720 // Concat each pair of subvectors and pack into the lower half of the array.
13721 SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());
13722 while (ConcatOps.size() > 1) {
13723 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
13724 SDValue V1 = ConcatOps[I];
13725 SDValue V2 = ConcatOps[I + 1];
13726 EVT SubVT = V1.getValueType();
13727 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
13728 ConcatOps[I / 2] =
13729 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
13730 }
13731 ConcatOps.resize(ConcatOps.size() / 2);
13732 }
13733 return ConcatOps[0];
13734 }
13735
13736 return SDValue();
13737}
13738
13739SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
13740 SelectionDAG &DAG) const {
13741 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
13742
13743 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13744 !Subtarget->isNeonAvailable()))
13745 return LowerFixedLengthInsertVectorElt(Op, DAG);
13746
13747 EVT VT = Op.getOperand(0).getValueType();
13748
13749 if (VT.getScalarType() == MVT::i1) {
13750 EVT VectorVT = getPromotedVTForPredicate(VT);
13751 SDLoc DL(Op);
13752 SDValue ExtendedVector =
13753 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
13754 SDValue ExtendedValue =
13755 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
13756 VectorVT.getScalarType().getSizeInBits() < 32
13757 ? MVT::i32
13758 : VectorVT.getScalarType());
13759 ExtendedVector =
13760 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
13761 ExtendedValue, Op.getOperand(2));
13762 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
13763 }
13764
13765 // Check for non-constant or out of range lane.
13766 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
13767 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
13768 return SDValue();
13769
13770 return Op;
13771}
13772
13773SDValue
13774AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
13775 SelectionDAG &DAG) const {
13776 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
13777 EVT VT = Op.getOperand(0).getValueType();
13778
13779 if (VT.getScalarType() == MVT::i1) {
13780 // We can't directly extract from an SVE predicate; extend it first.
13781 // (This isn't the only possible lowering, but it's straightforward.)
13782 EVT VectorVT = getPromotedVTForPredicate(VT);
13783 SDLoc DL(Op);
13784 SDValue Extend =
13785 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
13786 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
13787 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
13788 Extend, Op.getOperand(1));
13789 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
13790 }
13791
13792 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
13793 return LowerFixedLengthExtractVectorElt(Op, DAG);
13794
13795 // Check for non-constant or out of range lane.
13796 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
13797 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
13798 return SDValue();
13799
13800 // Insertion/extraction are legal for V128 types.
13801 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
13802 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
13803 VT == MVT::v8f16 || VT == MVT::v8bf16)
13804 return Op;
13805
13806 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
13807 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
13808 VT != MVT::v4bf16)
13809 return SDValue();
13810
13811 // For V64 types, we perform extraction by expanding the value
13812 // to a V128 type and perform the extraction on that.
13813 SDLoc DL(Op);
13814 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
13815 EVT WideTy = WideVec.getValueType();
13816
13817 EVT ExtrTy = WideTy.getVectorElementType();
13818 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
13819 ExtrTy = MVT::i32;
13820
13821 // For extractions, we just return the result directly.
13822 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
13823 Op.getOperand(1));
13824}
13825
13826SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
13827 SelectionDAG &DAG) const {
13828 assert(Op.getValueType().isFixedLengthVector() &&
13829 "Only cases that extract a fixed length vector are supported!");
13830
13831 EVT InVT = Op.getOperand(0).getValueType();
13832 unsigned Idx = Op.getConstantOperandVal(1);
13833 unsigned Size = Op.getValueSizeInBits();
13834
13835 // If we don't have legal types yet, do nothing
13836 if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT))
13837 return SDValue();
13838
13839 if (InVT.isScalableVector()) {
13840 // This will be matched by custom code during ISelDAGToDAG.
13841 if (Idx == 0 && isPackedVectorType(InVT, DAG))
13842 return Op;
13843
13844 return SDValue();
13845 }
13846
13847 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
13848 if (Idx == 0 && InVT.getSizeInBits() <= 128)
13849 return Op;
13850
13851 // If this is extracting the upper 64-bits of a 128-bit vector, we match
13852 // that directly.
13853 if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 &&
13854 InVT.getSizeInBits() == 128 && Subtarget->isNeonAvailable())
13855 return Op;
13856
13857 if (useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
13858 SDLoc DL(Op);
13859
13860 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
13861 SDValue NewInVec =
13862 convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
13863
13864 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ContainerVT, NewInVec,
13865 NewInVec, DAG.getConstant(Idx, DL, MVT::i64));
13866 return convertFromScalableVector(DAG, Op.getValueType(), Splice);
13867 }
13868
13869 return SDValue();
13870}
13871
13872SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
13873 SelectionDAG &DAG) const {
13874 assert(Op.getValueType().isScalableVector() &&
13875 "Only expect to lower inserts into scalable vectors!");
13876
13877 EVT InVT = Op.getOperand(1).getValueType();
13878 unsigned Idx = Op.getConstantOperandVal(2);
13879
13880 SDValue Vec0 = Op.getOperand(0);
13881 SDValue Vec1 = Op.getOperand(1);
13882 SDLoc DL(Op);
13883 EVT VT = Op.getValueType();
13884
13885 if (InVT.isScalableVector()) {
13886 if (!isTypeLegal(VT))
13887 return SDValue();
13888
13889 // Break down insert_subvector into simpler parts.
13890 if (VT.getVectorElementType() == MVT::i1) {
13891 unsigned NumElts = VT.getVectorMinNumElements();
13892 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
13893
13894 SDValue Lo, Hi;
13895 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
13896 DAG.getVectorIdxConstant(0, DL));
13897 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
13898 DAG.getVectorIdxConstant(NumElts / 2, DL));
13899 if (Idx < (NumElts / 2))
13900 Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
13901                       DAG.getVectorIdxConstant(Idx, DL));
13902      else
13903 Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
13904 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
13905
13906 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
13907 }
13908
13909 // Ensure the subvector is half the size of the main vector.
13910 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
13911 return SDValue();
13912
13913    // Here narrow and wide refer to the vector element types. After "casting",
13914    // both vectors must have the same bit length, and so, because the subvector
13915    // has fewer elements, those elements need to be bigger.
13916    EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
13917    EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
13918
13919 // NOP cast operands to the largest legal vector of the same element count.
13920 if (VT.isFloatingPoint()) {
13921 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
13922 Vec1 = getSVESafeBitCast(WideVT, Vec1, DAG);
13923 } else {
13924 // Legal integer vectors are already their largest so Vec0 is fine as is.
13925 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
13926 }
13927
13928 // To replace the top/bottom half of vector V with vector SubV we widen the
13929 // preserved half of V, concatenate this to SubV (the order depending on the
13930 // half being replaced) and then narrow the result.
13931 SDValue Narrow;
13932 if (Idx == 0) {
13933 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
13934 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
13935 } else {
13936      assert(Idx == InVT.getVectorMinNumElements() &&
13937             "Invalid subvector index!");
13938 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
13939 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
13940 }
13941
13942 return getSVESafeBitCast(VT, Narrow, DAG);
13943 }
13944
13945 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
13946 // This will be matched by custom code during ISelDAGToDAG.
13947 if (Vec0.isUndef())
13948 return Op;
13949
13950 std::optional<unsigned> PredPattern =
13951        getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
13952    auto PredTy = VT.changeVectorElementType(MVT::i1);
13953 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
13954 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
13955 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
13956 }
13957
13958 return SDValue();
13959}
13960
13961static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
13962 if (Op.getOpcode() != AArch64ISD::DUP &&
13963 Op.getOpcode() != ISD::SPLAT_VECTOR &&
13964 Op.getOpcode() != ISD::BUILD_VECTOR)
13965 return false;
13966
13967 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
13968 !isAllConstantBuildVector(Op, SplatVal))
13969 return false;
13970
13971 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
13972 !isa<ConstantSDNode>(Op->getOperand(0)))
13973 return false;
13974
13975 SplatVal = Op->getConstantOperandVal(0);
13976 if (Op.getValueType().getVectorElementType() != MVT::i64)
13977 SplatVal = (int32_t)SplatVal;
13978
13979 Negated = false;
13980 if (isPowerOf2_64(SplatVal))
13981 return true;
13982
13983 Negated = true;
13984 if (isPowerOf2_64(-SplatVal)) {
13985 SplatVal = -SplatVal;
13986 return true;
13987 }
13988
13989 return false;
13990}
13991
13992SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
13993 EVT VT = Op.getValueType();
13994 SDLoc dl(Op);
13995
13996 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
13997 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
13998
13999 assert(VT.isScalableVector() && "Expected a scalable vector.");
14000
14001 bool Signed = Op.getOpcode() == ISD::SDIV;
14002 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
14003
14004 bool Negated;
14005 uint64_t SplatVal;
14006 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
14007 SDValue Pg = getPredicateForScalableVector(DAG, dl, VT);
14008 SDValue Res =
14009 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0),
14010 DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32));
14011 if (Negated)
14012 Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res);
14013
14014 return Res;
14015 }
14016
14017 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
14018 return LowerToPredicatedOp(Op, DAG, PredOpcode);
14019
14020 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
14021 // operations, and truncate the result.
14022 EVT WidenedVT;
14023 if (VT == MVT::nxv16i8)
14024 WidenedVT = MVT::nxv8i16;
14025 else if (VT == MVT::nxv8i16)
14026 WidenedVT = MVT::nxv4i32;
14027 else
14028 llvm_unreachable("Unexpected Custom DIV operation");
14029
14030 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
14031 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
14032 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
14033 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
14034 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
14035 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
14036 SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
14037 SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
14038 return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
14039}
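// A minimal scalar sketch (not part of this file) of the power-of-two signed
// division handled above: an SRAD-style rounding-toward-zero arithmetic shift,
// followed by a negation when the divisor is a negative power of two. The
// helper name is an illustrative assumption, and INT64_MIN corner cases are
// ignored.
#include <cstdint>
static int64_t sdivByPow2Sketch(int64_t X, unsigned Log2Abs, bool Negated) {
  // Round toward zero: bias negative inputs by (2^k - 1) before shifting.
  int64_t Bias = X < 0 ? (int64_t(1) << Log2Abs) - 1 : 0;
  int64_t Quot = (X + Bias) >> Log2Abs; // behaves like X / 2^k
  return Negated ? -Quot : Quot;        // X / -(2^k) == -(X / 2^k)
}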
14040
14041bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
14042  // Currently no fixed length shuffles that require SVE are legal.
14043 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14044 return false;
14045
14046 if (VT.getVectorNumElements() == 4 &&
14047 (VT.is128BitVector() || VT.is64BitVector())) {
14048 unsigned Cost = getPerfectShuffleCost(M);
14049 if (Cost <= 1)
14050 return true;
14051 }
14052
14053 bool DummyBool;
14054 int DummyInt;
14055 unsigned DummyUnsigned;
14056
14057 return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
14058 isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
14059 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
14060 // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
14061 isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
14062 isZIPMask(M, VT, DummyUnsigned) ||
14063 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
14064 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
14065 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
14066 isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
14067 isConcatMask(M, VT, VT.getSizeInBits() == 128));
14068}
14069
14070bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
14071                                                   EVT VT) const {
14072 // Just delegate to the generic legality, clear masks aren't special.
14073 return isShuffleMaskLegal(M, VT);
14074}
14075
14076/// getVShiftImm - Check if this is a valid build_vector for the immediate
14077/// operand of a vector shift operation, where all the elements of the
14078/// build_vector must have the same constant integer value.
14079static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
14080 // Ignore bit_converts.
14081 while (Op.getOpcode() == ISD::BITCAST)
14082 Op = Op.getOperand(0);
14083 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
14084 APInt SplatBits, SplatUndef;
14085 unsigned SplatBitSize;
14086 bool HasAnyUndefs;
14087 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
14088 HasAnyUndefs, ElementBits) ||
14089 SplatBitSize > ElementBits)
14090 return false;
14091 Cnt = SplatBits.getSExtValue();
14092 return true;
14093}
14094
14095/// isVShiftLImm - Check if this is a valid build_vector for the immediate
14096/// operand of a vector shift left operation. That value must be in the range:
14097/// 0 <= Value < ElementBits for a left shift; or
14098/// 0 <= Value <= ElementBits for a long left shift.
14099static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
14100 assert(VT.isVector() && "vector shift count is not a vector type");
14101 int64_t ElementBits = VT.getScalarSizeInBits();
14102 if (!getVShiftImm(Op, ElementBits, Cnt))
14103 return false;
14104 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
14105}
14106
14107/// isVShiftRImm - Check if this is a valid build_vector for the immediate
14108/// operand of a vector shift right operation. The value must be in the range:
14109///   1 <= Value <= ElementBits for a right shift; or 1 <= Value <= ElementBits/2 for a narrowing right shift.
14110static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
14111 assert(VT.isVector() && "vector shift count is not a vector type");
14112 int64_t ElementBits = VT.getScalarSizeInBits();
14113 if (!getVShiftImm(Op, ElementBits, Cnt))
14114 return false;
14115 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
14116}
14117
14118SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
14119 SelectionDAG &DAG) const {
14120 EVT VT = Op.getValueType();
14121
14122 if (VT.getScalarType() == MVT::i1) {
14123 // Lower i1 truncate to `(x & 1) != 0`.
14124 SDLoc dl(Op);
14125 EVT OpVT = Op.getOperand(0).getValueType();
14126 SDValue Zero = DAG.getConstant(0, dl, OpVT);
14127 SDValue One = DAG.getConstant(1, dl, OpVT);
14128 SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
14129 return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
14130 }
14131
14132 if (!VT.isVector() || VT.isScalableVector())
14133 return SDValue();
14134
14135 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
14136 !Subtarget->isNeonAvailable()))
14137 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
14138
14139 return SDValue();
14140}
14141
14142// Check if we can lower this SRL to a rounding shift instruction. ResVT is
14143// possibly a truncated type; it tells how many bits of the value are to be
14144// used.
14145static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT,
14146                                            SelectionDAG &DAG,
14147 unsigned &ShiftValue,
14148 SDValue &RShOperand) {
14149 if (Shift->getOpcode() != ISD::SRL)
14150 return false;
14151
14152 EVT VT = Shift.getValueType();
14153 assert(VT.isScalableVT());
14154
14155 auto ShiftOp1 =
14156 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Shift->getOperand(1)));
14157 if (!ShiftOp1)
14158 return false;
14159
14160 ShiftValue = ShiftOp1->getZExtValue();
14161 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
14162 return false;
14163
14164 SDValue Add = Shift->getOperand(0);
14165 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
14166 return false;
14167
14168  assert(ResVT.getScalarSizeInBits() <= VT.getScalarSizeInBits() &&
14169         "ResVT must be truncated or same type as the shift.");
14170 // Check if an overflow can lead to incorrect results.
14171 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
14172 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
14173 return false;
14174
14175 auto AddOp1 =
14176 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
14177 if (!AddOp1)
14178 return false;
14179 uint64_t AddValue = AddOp1->getZExtValue();
14180 if (AddValue != 1ULL << (ShiftValue - 1))
14181 return false;
14182
14183 RShOperand = Add->getOperand(0);
14184 return true;
14185}
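// A minimal scalar sketch (not part of this file) of the pattern recognised
// above: for S >= 1, adding 2^(S-1) before a logical right shift by S rounds
// to nearest, which a single URSHR-style rounding shift computes directly.
// The helper name is an illustrative assumption; as with the nuw check above,
// the add must not wrap.
#include <cstdint>
static uint32_t roundingShiftSketch(uint32_t X, unsigned S) {
  return (X + (1u << (S - 1))) >> S; // round-to-nearest right shift by S
}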
14186
14187SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
14188 SelectionDAG &DAG) const {
14189 EVT VT = Op.getValueType();
14190 SDLoc DL(Op);
14191 int64_t Cnt;
14192
14193 if (!Op.getOperand(1).getValueType().isVector())
14194 return Op;
14195 unsigned EltSize = VT.getScalarSizeInBits();
14196
14197 switch (Op.getOpcode()) {
14198 case ISD::SHL:
14199 if (VT.isScalableVector() ||
14200        useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14201      return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
14202
14203 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
14204 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
14205 DAG.getConstant(Cnt, DL, MVT::i32));
14206 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
14207 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
14208 MVT::i32),
14209 Op.getOperand(0), Op.getOperand(1));
14210 case ISD::SRA:
14211 case ISD::SRL:
14212 if (VT.isScalableVector() && Subtarget->hasSVE2orSME()) {
14213 SDValue RShOperand;
14214 unsigned ShiftValue;
14215 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
14216 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
14217 getPredicateForVector(DAG, DL, VT), RShOperand,
14218 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
14219 }
14220
14221 if (VT.isScalableVector() ||
14222 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
14223 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
14224                                                : AArch64ISD::SRL_PRED;
14225      return LowerToPredicatedOp(Op, DAG, Opc);
14226 }
14227
14228 // Right shift immediate
14229 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
14230 unsigned Opc =
14231 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
14232 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
14233 DAG.getConstant(Cnt, DL, MVT::i32));
14234 }
14235
14236 // Right shift register. Note, there is not a shift right register
14237 // instruction, but the shift left register instruction takes a signed
14238 // value, where negative numbers specify a right shift.
14239 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
14240 : Intrinsic::aarch64_neon_ushl;
14241 // negate the shift amount
14242 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
14243 Op.getOperand(1));
14244 SDValue NegShiftLeft =
14245        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
14246                    DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
14247 NegShift);
14248 return NegShiftLeft;
14249 }
14250
14251 llvm_unreachable("unexpected shift opcode");
14252}
14253
14254static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
14255                                    AArch64CC::CondCode CC, bool NoNans, EVT VT,
14256 const SDLoc &dl, SelectionDAG &DAG) {
14257 EVT SrcVT = LHS.getValueType();
14258 assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
14259 "function only supposed to emit natural comparisons");
14260
14261 APInt SplatValue;
14262 APInt SplatUndef;
14263 unsigned SplatBitSize = 0;
14264 bool HasAnyUndefs;
14265
14266 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
14267 bool IsCnst = BVN && BVN->isConstantSplat(SplatValue, SplatUndef,
14268 SplatBitSize, HasAnyUndefs);
14269
14270 bool IsZero = IsCnst && SplatValue == 0;
14271 bool IsOne =
14272 IsCnst && SrcVT.getScalarSizeInBits() == SplatBitSize && SplatValue == 1;
14273 bool IsMinusOne = IsCnst && SplatValue.isAllOnes();
14274
14275 if (SrcVT.getVectorElementType().isFloatingPoint()) {
14276 switch (CC) {
14277 default:
14278 return SDValue();
14279 case AArch64CC::NE: {
14280 SDValue Fcmeq;
14281 if (IsZero)
14282 Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
14283 else
14284 Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
14285 return DAG.getNOT(dl, Fcmeq, VT);
14286 }
14287 case AArch64CC::EQ:
14288 if (IsZero)
14289 return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
14290 return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
14291 case AArch64CC::GE:
14292 if (IsZero)
14293 return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
14294 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
14295 case AArch64CC::GT:
14296 if (IsZero)
14297 return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
14298 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
14299 case AArch64CC::LE:
14300 if (!NoNans)
14301 return SDValue();
14302      // If we ignore NaNs then we can use the LS implementation.
14303 [[fallthrough]];
14304 case AArch64CC::LS:
14305 if (IsZero)
14306 return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
14307 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
14308 case AArch64CC::LT:
14309 if (!NoNans)
14310 return SDValue();
14311      // If we ignore NaNs then we can use the MI implementation.
14312 [[fallthrough]];
14313 case AArch64CC::MI:
14314 if (IsZero)
14315 return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
14316 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
14317 }
14318 }
14319
14320 switch (CC) {
14321 default:
14322 return SDValue();
14323 case AArch64CC::NE: {
14324 SDValue Cmeq;
14325 if (IsZero)
14326 Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
14327 else
14328 Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
14329 return DAG.getNOT(dl, Cmeq, VT);
14330 }
14331 case AArch64CC::EQ:
14332 if (IsZero)
14333 return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
14334 return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
14335 case AArch64CC::GE:
14336 if (IsZero)
14337 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
14338 return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
14339 case AArch64CC::GT:
14340 if (IsZero)
14341 return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
14342 if (IsMinusOne)
14343 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS, RHS);
14344 return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
14345 case AArch64CC::LE:
14346 if (IsZero)
14347 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
14348 return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
14349 case AArch64CC::LS:
14350 return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
14351 case AArch64CC::LO:
14352 return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
14353 case AArch64CC::LT:
14354 if (IsZero)
14355 return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
14356 if (IsOne)
14357 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
14358 return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
14359 case AArch64CC::HI:
14360 return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
14361 case AArch64CC::HS:
14362 return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
14363 }
14364}
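// A minimal per-lane sketch (not part of this file) of how the integer NE case
// above is emitted: AArch64 has no vector "compare not equal" instruction, so
// NE is built as NOT(CMEQ), each lane yielding an all-ones or all-zeros mask.
// The helper names are illustrative assumptions.
#include <cstdint>
static uint32_t cmeqLaneSketch(uint32_t A, uint32_t B) {
  return A == B ? 0xFFFFFFFFu : 0u; // CMEQ: all-ones mask on equality
}
static uint32_t cmneLaneSketch(uint32_t A, uint32_t B) {
  return ~cmeqLaneSketch(A, B);     // NE lowers to NOT(CMEQ)
}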
14365
14366SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
14367 SelectionDAG &DAG) const {
14368 if (Op.getValueType().isScalableVector())
14369 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
14370
14371 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
14372 !Subtarget->isNeonAvailable()))
14373 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
14374
14375 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
14376 SDValue LHS = Op.getOperand(0);
14377 SDValue RHS = Op.getOperand(1);
14378 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
14379 SDLoc dl(Op);
14380
14381 if (LHS.getValueType().getVectorElementType().isInteger()) {
14382 assert(LHS.getValueType() == RHS.getValueType());
14383    AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
14384    SDValue Cmp =
14385 EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
14386 return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
14387 }
14388
14389 // Lower isnan(x) | isnan(never-nan) to x != x.
14390 // Lower !isnan(x) & !isnan(never-nan) to x == x.
14391 if (CC == ISD::SETUO || CC == ISD::SETO) {
14392 bool OneNaN = false;
14393 if (LHS == RHS) {
14394 OneNaN = true;
14395 } else if (DAG.isKnownNeverNaN(RHS)) {
14396 OneNaN = true;
14397 RHS = LHS;
14398 } else if (DAG.isKnownNeverNaN(LHS)) {
14399 OneNaN = true;
14400 LHS = RHS;
14401 }
14402 if (OneNaN) {
14403      CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
14404    }
14405 }
14406
14407 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
14408
14409 // Make v4f16 (only) fcmp operations utilise vector instructions
14410  // v8f16 support will be a little more complicated
14411 if ((!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) ||
14412 LHS.getValueType().getVectorElementType() == MVT::bf16) {
14413 if (LHS.getValueType().getVectorNumElements() == 4) {
14414 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
14415 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
14416 SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
14417 DAG.ReplaceAllUsesWith(Op, NewSetcc);
14418 CmpVT = MVT::v4i32;
14419 } else
14420 return SDValue();
14421 }
14422
14423 assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
14424 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
14425 LHS.getValueType().getVectorElementType() != MVT::f128);
14426
14427 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
14428 // clean. Some of them require two branches to implement.
14429 AArch64CC::CondCode CC1, CC2;
14430 bool ShouldInvert;
14431 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
14432
14433 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
14434 SDValue Cmp =
14435 EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
14436 if (!Cmp.getNode())
14437 return SDValue();
14438
14439 if (CC2 != AArch64CC::AL) {
14440 SDValue Cmp2 =
14441 EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
14442 if (!Cmp2.getNode())
14443 return SDValue();
14444
14445 Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
14446 }
14447
14448 Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
14449
14450 if (ShouldInvert)
14451 Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
14452
14453 return Cmp;
14454}
14455
14456static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
14457 SelectionDAG &DAG) {
14458 SDValue VecOp = ScalarOp.getOperand(0);
14459 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
14460 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
14461 DAG.getConstant(0, DL, MVT::i64));
14462}
14463
14464static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
14465 SDLoc DL, SelectionDAG &DAG) {
14466 unsigned ScalarOpcode;
14467 switch (Opcode) {
14468 case ISD::VECREDUCE_AND:
14469 ScalarOpcode = ISD::AND;
14470 break;
14471 case ISD::VECREDUCE_OR:
14472 ScalarOpcode = ISD::OR;
14473 break;
14474 case ISD::VECREDUCE_XOR:
14475 ScalarOpcode = ISD::XOR;
14476 break;
14477 default:
14478 llvm_unreachable("Expected bitwise vector reduction");
14479 return SDValue();
14480 }
14481
14482 EVT VecVT = Vec.getValueType();
14483 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
14484 "Expected power-of-2 length vector");
14485
14486 EVT ElemVT = VecVT.getVectorElementType();
14487
14488 SDValue Result;
14489 unsigned NumElems = VecVT.getVectorNumElements();
14490
14491 // Special case for boolean reductions
14492 if (ElemVT == MVT::i1) {
14493 // Split large vectors into smaller ones
14494 if (NumElems > 16) {
14495 SDValue Lo, Hi;
14496 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
14497 EVT HalfVT = Lo.getValueType();
14498 SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
14499 return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
14500 }
14501
14502 // Vectors that are less than 64 bits get widened to neatly fit a 64 bit
14503 // register, so e.g. <4 x i1> gets lowered to <4 x i16>. Sign extending to
14504 // this element size leads to the best codegen, since e.g. setcc results
14505 // might need to be truncated otherwise.
14506 EVT ExtendedVT = MVT::getIntegerVT(std::max(64u / NumElems, 8u));
14507
14508 // any_ext doesn't work with umin/umax, so only use it for uadd.
14509 unsigned ExtendOp =
14510 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
14511 SDValue Extended = DAG.getNode(
14512 ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
14513 switch (ScalarOpcode) {
14514 case ISD::AND:
14515 Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
14516 break;
14517 case ISD::OR:
14518 Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
14519 break;
14520 case ISD::XOR:
14521 Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
14522 break;
14523 default:
14524 llvm_unreachable("Unexpected Opcode");
14525 }
14526
14527 Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
14528 } else {
14529 // Iteratively split the vector in half and combine using the bitwise
14530 // operation until it fits in a 64 bit register.
14531 while (VecVT.getSizeInBits() > 64) {
14532 SDValue Lo, Hi;
14533 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
14534 VecVT = Lo.getValueType();
14535 NumElems = VecVT.getVectorNumElements();
14536 Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
14537 }
14538
14539 EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
14540
14541 // Do the remaining work on a scalar since it allows the code generator to
14542 // combine the shift and bitwise operation into one instruction and since
14543 // integer instructions can have higher throughput than vector instructions.
14544 SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
14545
14546 // Iteratively combine the lower and upper halves of the scalar using the
14547 // bitwise operation, halving the relevant region of the scalar in each
14548 // iteration, until the relevant region is just one element of the original
14549 // vector.
14550 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
14551 SDValue ShiftAmount =
14552 DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
14553 SDValue Shifted =
14554 DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
14555 Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
14556 }
14557
14558 Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
14559 }
14560
14561 return DAG.getAnyExtOrTrunc(Result, DL, VT);
14562}
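// A minimal scalar sketch (not part of this file) of the shift-and-combine
// loop above, assuming eight i8 lanes packed into one 64-bit word: halving
// shifts fold every lane into lane 0. XOR is shown; the helper name is an
// illustrative assumption.
#include <cstdint>
static uint8_t xorReduce8x8Sketch(uint64_t PackedLanes) {
  for (unsigned ShiftBits = 32; ShiftBits >= 8; ShiftBits /= 2)
    PackedLanes ^= PackedLanes >> ShiftBits; // fold upper half into lower half
  return uint8_t(PackedLanes);               // lane 0 holds the XOR of all lanes
}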
14563
14564SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
14565 SelectionDAG &DAG) const {
14566 SDValue Src = Op.getOperand(0);
14567
14568 // Try to lower fixed length reductions to SVE.
14569 EVT SrcVT = Src.getValueType();
14570 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
14571 Op.getOpcode() == ISD::VECREDUCE_AND ||
14572 Op.getOpcode() == ISD::VECREDUCE_OR ||
14573 Op.getOpcode() == ISD::VECREDUCE_XOR ||
14574 Op.getOpcode() == ISD::VECREDUCE_FADD ||
14575 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
14576 SrcVT.getVectorElementType() == MVT::i64);
14577 if (SrcVT.isScalableVector() ||
14578      useSVEForFixedLengthVectorVT(
14579          SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
14580
14581 if (SrcVT.getVectorElementType() == MVT::i1)
14582 return LowerPredReductionToSVE(Op, DAG);
14583
14584 switch (Op.getOpcode()) {
14585 case ISD::VECREDUCE_ADD:
14586 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
14587 case ISD::VECREDUCE_AND:
14588 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
14589 case ISD::VECREDUCE_OR:
14590 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
14591    case ISD::VECREDUCE_SMAX:
14592      return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
14593    case ISD::VECREDUCE_SMIN:
14594      return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
14595    case ISD::VECREDUCE_UMAX:
14596      return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
14597    case ISD::VECREDUCE_UMIN:
14598      return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
14599    case ISD::VECREDUCE_XOR:
14600      return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
14601    case ISD::VECREDUCE_FADD:
14602      return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
14603    case ISD::VECREDUCE_FMAX:
14604      return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
14605    case ISD::VECREDUCE_FMIN:
14606      return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
14607    case ISD::VECREDUCE_FMAXIMUM:
14608      return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
14609    case ISD::VECREDUCE_FMINIMUM:
14610      return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
14611 default:
14612 llvm_unreachable("Unhandled fixed length reduction");
14613 }
14614 }
14615
14616 // Lower NEON reductions.
14617 SDLoc dl(Op);
14618 switch (Op.getOpcode()) {
14619 case ISD::VECREDUCE_AND:
14620 case ISD::VECREDUCE_OR:
14621 case ISD::VECREDUCE_XOR:
14622 return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
14623 Op.getValueType(), dl, DAG);
14624 case ISD::VECREDUCE_ADD:
14625 return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
14626  case ISD::VECREDUCE_SMAX:
14627    return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
14628  case ISD::VECREDUCE_SMIN:
14629    return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
14630  case ISD::VECREDUCE_UMAX:
14631    return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
14632  case ISD::VECREDUCE_UMIN:
14633    return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
14634 default:
14635 llvm_unreachable("Unhandled reduction");
14636 }
14637}
14638
14639SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
14640 SelectionDAG &DAG) const {
14641 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
14642 // No point replacing if we don't have the relevant instruction/libcall anyway
14643 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
14644 return SDValue();
14645
14646 // LSE has an atomic load-clear instruction, but not a load-and.
14647 SDLoc dl(Op);
14648 MVT VT = Op.getSimpleValueType();
14649 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
14650 SDValue RHS = Op.getOperand(2);
14651 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
14652 RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
14653 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
14654 Op.getOperand(0), Op.getOperand(1), RHS,
14655 AN->getMemOperand());
14656}
14657
14658SDValue
14659AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
14660 SelectionDAG &DAG) const {
14661
14662 SDLoc dl(Op);
14663 // Get the inputs.
14664 SDNode *Node = Op.getNode();
14665 SDValue Chain = Op.getOperand(0);
14666 SDValue Size = Op.getOperand(1);
14667  MaybeAlign Align =
14668      cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
14669 EVT VT = Node->getValueType(0);
14670
14671  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
14672          "no-stack-arg-probe")) {
14673 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14674 Chain = SP.getValue(1);
14675 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14676 if (Align)
14677 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14678 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
14679 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
14680 SDValue Ops[2] = {SP, Chain};
14681 return DAG.getMergeValues(Ops, dl);
14682 }
14683
14684 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
14685
14686 EVT PtrVT = getPointerTy(DAG.getDataLayout());
14687  SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
14688                                               PtrVT, 0);
14689
14690 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
14691 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
14692 if (Subtarget->hasCustomCallingConv())
14693 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
14694
14695 Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
14696 DAG.getConstant(4, dl, MVT::i64));
14697 Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
14698 Chain =
14699 DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
14700 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
14701 DAG.getRegisterMask(Mask), Chain.getValue(1));
14702 // To match the actual intent better, we should read the output from X15 here
14703 // again (instead of potentially spilling it to the stack), but rereading Size
14704 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
14705 // here.
14706
14707 Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
14708 DAG.getConstant(4, dl, MVT::i64));
14709
14710 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14711 Chain = SP.getValue(1);
14712 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14713 if (Align)
14714 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14715 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
14716 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
14717
14718 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
14719
14720 SDValue Ops[2] = {SP, Chain};
14721 return DAG.getMergeValues(Ops, dl);
14722}
14723
14724SDValue
14725AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
14726 SelectionDAG &DAG) const {
14727 // Get the inputs.
14728 SDNode *Node = Op.getNode();
14729 SDValue Chain = Op.getOperand(0);
14730 SDValue Size = Op.getOperand(1);
14731
14732  MaybeAlign Align =
14733      cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
14734 SDLoc dl(Op);
14735 EVT VT = Node->getValueType(0);
14736
14737 // Construct the new SP value in a GPR.
14738 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14739 Chain = SP.getValue(1);
14740 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14741 if (Align)
14742 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14743 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
14744
14745 // Set the real SP to the new value with a probing loop.
14746 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
14747 SDValue Ops[2] = {SP, Chain};
14748 return DAG.getMergeValues(Ops, dl);
14749}
14750
14751SDValue
14752AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
14753 SelectionDAG &DAG) const {
14754  MachineFunction &MF = DAG.getMachineFunction();
14755
14756 if (Subtarget->isTargetWindows())
14757 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
14758 else if (hasInlineStackProbe(MF))
14759 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
14760 else
14761 return SDValue();
14762}
14763
14764// When x and y are extended, lower:
14765// avgfloor(x, y) -> (x + y) >> 1
14766// avgceil(x, y) -> (x + y + 1) >> 1
14767
14768// Otherwise, lower to:
14769// avgfloor(x, y) -> (x >> 1) + (y >> 1) + (x & y & 1)
14770// avgceil(x, y) -> (x >> 1) + (y >> 1) + ((x | y) & 1)
14771SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
14772 unsigned NewOp) const {
14773 if (Subtarget->hasSVE2())
14774 return LowerToPredicatedOp(Op, DAG, NewOp);
14775
14776 SDLoc dl(Op);
14777 SDValue OpA = Op->getOperand(0);
14778 SDValue OpB = Op->getOperand(1);
14779 EVT VT = Op.getValueType();
14780 bool IsCeil =
14781 (Op->getOpcode() == ISD::AVGCEILS || Op->getOpcode() == ISD::AVGCEILU);
14782 bool IsSigned =
14783 (Op->getOpcode() == ISD::AVGFLOORS || Op->getOpcode() == ISD::AVGCEILS);
14784 unsigned ShiftOpc = IsSigned ? ISD::SRA : ISD::SRL;
14785
14786 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
14787
14788 auto IsZeroExtended = [&DAG](SDValue &Node) {
14789 KnownBits Known = DAG.computeKnownBits(Node, 0);
14790 return Known.Zero.isSignBitSet();
14791 };
14792
14793 auto IsSignExtended = [&DAG](SDValue &Node) {
14794 return (DAG.ComputeNumSignBits(Node, 0) > 1);
14795 };
14796
14797 SDValue ConstantOne = DAG.getConstant(1, dl, VT);
14798 if ((!IsSigned && IsZeroExtended(OpA) && IsZeroExtended(OpB)) ||
14799 (IsSigned && IsSignExtended(OpA) && IsSignExtended(OpB))) {
14800 SDValue Add = DAG.getNode(ISD::ADD, dl, VT, OpA, OpB);
14801 if (IsCeil)
14802 Add = DAG.getNode(ISD::ADD, dl, VT, Add, ConstantOne);
14803 return DAG.getNode(ShiftOpc, dl, VT, Add, ConstantOne);
14804 }
14805
14806 SDValue ShiftOpA = DAG.getNode(ShiftOpc, dl, VT, OpA, ConstantOne);
14807 SDValue ShiftOpB = DAG.getNode(ShiftOpc, dl, VT, OpB, ConstantOne);
14808
14809 SDValue tmp = DAG.getNode(IsCeil ? ISD::OR : ISD::AND, dl, VT, OpA, OpB);
14810 tmp = DAG.getNode(ISD::AND, dl, VT, tmp, ConstantOne);
14811 SDValue Add = DAG.getNode(ISD::ADD, dl, VT, ShiftOpA, ShiftOpB);
14812 return DAG.getNode(ISD::ADD, dl, VT, Add, tmp);
14813}
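// A minimal scalar sketch (not part of this file) of the halving-add
// identities used above for the non-extended case; unlike (X + Y) >> 1, these
// forms cannot overflow. The helper names are illustrative assumptions.
#include <cstdint>
static uint32_t avgFloorSketch(uint32_t X, uint32_t Y) {
  return (X >> 1) + (Y >> 1) + (X & Y & 1);   // floor((X + Y) / 2)
}
static uint32_t avgCeilSketch(uint32_t X, uint32_t Y) {
  return (X >> 1) + (Y >> 1) + ((X | Y) & 1); // ceil((X + Y) / 2)
}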
14814
14815SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
14816 SelectionDAG &DAG) const {
14817 EVT VT = Op.getValueType();
14818 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
14819
14820 SDLoc DL(Op);
14821 APInt MulImm = Op.getConstantOperandAPInt(0);
14822 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
14823 VT);
14824}
14825
14826/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
14827template <unsigned NumVecs>
14828static bool
14829setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
14830              AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
14831  Info.opc = ISD::INTRINSIC_VOID;
14832  // Retrieve EC from first vector argument.
14833  const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
14834  ElementCount EC = VT.getVectorElementCount();
14835#ifndef NDEBUG
14836 // Check the assumption that all input vectors are the same type.
14837 for (unsigned I = 0; I < NumVecs; ++I)
14838 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
14839 "Invalid type.");
14840#endif
14841 // memVT is `NumVecs * VT`.
14842  Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
14843                                EC * NumVecs);
14844 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
14845 Info.offset = 0;
14846 Info.align.reset();
14847  Info.flags = MachineMemOperand::MOStore;
14848  return true;
14849}
14850
14851/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
14852/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
14853/// specified in the intrinsic calls.
14854bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
14855                                               const CallInst &I,
14856 MachineFunction &MF,
14857 unsigned Intrinsic) const {
14858 auto &DL = I.getModule()->getDataLayout();
14859 switch (Intrinsic) {
14860 case Intrinsic::aarch64_sve_st2:
14861 return setInfoSVEStN<2>(*this, DL, Info, I);
14862 case Intrinsic::aarch64_sve_st3:
14863 return setInfoSVEStN<3>(*this, DL, Info, I);
14864 case Intrinsic::aarch64_sve_st4:
14865 return setInfoSVEStN<4>(*this, DL, Info, I);
14866 case Intrinsic::aarch64_neon_ld2:
14867 case Intrinsic::aarch64_neon_ld3:
14868 case Intrinsic::aarch64_neon_ld4:
14869 case Intrinsic::aarch64_neon_ld1x2:
14870 case Intrinsic::aarch64_neon_ld1x3:
14871 case Intrinsic::aarch64_neon_ld1x4: {
14872    Info.opc = ISD::INTRINSIC_W_CHAIN;
14873    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
14874 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
14875 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14876 Info.offset = 0;
14877 Info.align.reset();
14878 // volatile loads with NEON intrinsics not supported
14879    Info.flags = MachineMemOperand::MOLoad;
14880    return true;
14881 }
14882 case Intrinsic::aarch64_neon_ld2lane:
14883 case Intrinsic::aarch64_neon_ld3lane:
14884 case Intrinsic::aarch64_neon_ld4lane:
14885 case Intrinsic::aarch64_neon_ld2r:
14886 case Intrinsic::aarch64_neon_ld3r:
14887 case Intrinsic::aarch64_neon_ld4r: {
14888    Info.opc = ISD::INTRINSIC_W_CHAIN;
14889    // The ldN/ldNr/ldNlane intrinsics return a struct of vectors of the same type.
14890 Type *RetTy = I.getType();
14891 auto *StructTy = cast<StructType>(RetTy);
14892 unsigned NumElts = StructTy->getNumElements();
14893 Type *VecTy = StructTy->getElementType(0);
14894 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
14895 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
14896 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14897 Info.offset = 0;
14898 Info.align.reset();
14899 // volatile loads with NEON intrinsics not supported
14900    Info.flags = MachineMemOperand::MOLoad;
14901    return true;
14902 }
14903 case Intrinsic::aarch64_neon_st2:
14904 case Intrinsic::aarch64_neon_st3:
14905 case Intrinsic::aarch64_neon_st4:
14906 case Intrinsic::aarch64_neon_st1x2:
14907 case Intrinsic::aarch64_neon_st1x3:
14908 case Intrinsic::aarch64_neon_st1x4: {
14909    Info.opc = ISD::INTRINSIC_VOID;
14910    unsigned NumElts = 0;
14911 for (const Value *Arg : I.args()) {
14912 Type *ArgTy = Arg->getType();
14913 if (!ArgTy->isVectorTy())
14914 break;
14915 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
14916 }
14917 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
14918 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14919 Info.offset = 0;
14920 Info.align.reset();
14921 // volatile stores with NEON intrinsics not supported
14922    Info.flags = MachineMemOperand::MOStore;
14923    return true;
14924 }
14925 case Intrinsic::aarch64_neon_st2lane:
14926 case Intrinsic::aarch64_neon_st3lane:
14927 case Intrinsic::aarch64_neon_st4lane: {
14928    Info.opc = ISD::INTRINSIC_VOID;
14929    unsigned NumElts = 0;
14930    // All of the vector arguments have the same type.
14931 Type *VecTy = I.getArgOperand(0)->getType();
14932 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
14933
14934 for (const Value *Arg : I.args()) {
14935 Type *ArgTy = Arg->getType();
14936 if (!ArgTy->isVectorTy())
14937 break;
14938 NumElts += 1;
14939 }
14940
14941 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
14942 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14943 Info.offset = 0;
14944 Info.align.reset();
14945 // volatile stores with NEON intrinsics not supported
14946    Info.flags = MachineMemOperand::MOStore;
14947    return true;
14948 }
14949 case Intrinsic::aarch64_ldaxr:
14950 case Intrinsic::aarch64_ldxr: {
14951 Type *ValTy = I.getParamElementType(0);
14952    Info.opc = ISD::INTRINSIC_W_CHAIN;
14953    Info.memVT = MVT::getVT(ValTy);
14954 Info.ptrVal = I.getArgOperand(0);
14955 Info.offset = 0;
14956 Info.align = DL.getABITypeAlign(ValTy);
14957    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
14958    return true;
14959 }
14960 case Intrinsic::aarch64_stlxr:
14961 case Intrinsic::aarch64_stxr: {
14962 Type *ValTy = I.getParamElementType(1);
14963    Info.opc = ISD::INTRINSIC_W_CHAIN;
14964    Info.memVT = MVT::getVT(ValTy);
14965 Info.ptrVal = I.getArgOperand(1);
14966 Info.offset = 0;
14967 Info.align = DL.getABITypeAlign(ValTy);
14968    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
14969    return true;
14970 }
14971 case Intrinsic::aarch64_ldaxp:
14972 case Intrinsic::aarch64_ldxp:
14973    Info.opc = ISD::INTRINSIC_W_CHAIN;
14974    Info.memVT = MVT::i128;
14975 Info.ptrVal = I.getArgOperand(0);
14976 Info.offset = 0;
14977 Info.align = Align(16);
14978    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
14979    return true;
14980 case Intrinsic::aarch64_stlxp:
14981 case Intrinsic::aarch64_stxp:
14982    Info.opc = ISD::INTRINSIC_W_CHAIN;
14983    Info.memVT = MVT::i128;
14984 Info.ptrVal = I.getArgOperand(2);
14985 Info.offset = 0;
14986 Info.align = Align(16);
14987    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
14988    return true;
14989 case Intrinsic::aarch64_sve_ldnt1: {
14990 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
14991    Info.opc = ISD::INTRINSIC_W_CHAIN;
14992    Info.memVT = MVT::getVT(I.getType());
14993 Info.ptrVal = I.getArgOperand(1);
14994 Info.offset = 0;
14995 Info.align = DL.getABITypeAlign(ElTy);
14996    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
14997    return true;
14998 }
14999 case Intrinsic::aarch64_sve_stnt1: {
15000 Type *ElTy =
15001 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
15002    Info.opc = ISD::INTRINSIC_W_CHAIN;
15003    Info.memVT = MVT::getVT(I.getOperand(0)->getType());
15004 Info.ptrVal = I.getArgOperand(2);
15005 Info.offset = 0;
15006 Info.align = DL.getABITypeAlign(ElTy);
15007    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
15008    return true;
15009 }
15010 case Intrinsic::aarch64_mops_memset_tag: {
15011 Value *Dst = I.getArgOperand(0);
15012 Value *Val = I.getArgOperand(1);
15013    Info.opc = ISD::INTRINSIC_W_CHAIN;
15014    Info.memVT = MVT::getVT(Val->getType());
15015 Info.ptrVal = Dst;
15016 Info.offset = 0;
15017 Info.align = I.getParamAlign(0).valueOrOne();
15018    Info.flags = MachineMemOperand::MOStore;
15019    // The size of the memory being operated on is unknown at this point.
15020    Info.size = MemoryLocation::UnknownSize;
15021    return true;
15022 }
15023 default:
15024 break;
15025 }
15026
15027 return false;
15028}
15029
15030bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
15031                                                  ISD::LoadExtType ExtTy,
15032 EVT NewVT) const {
15033 // TODO: This may be worth removing. Check regression tests for diffs.
15034 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
15035 return false;
15036
15037 // If we're reducing the load width in order to avoid having to use an extra
15038 // instruction to do extension then it's probably a good idea.
15039 if (ExtTy != ISD::NON_EXTLOAD)
15040 return true;
15041 // Don't reduce load width if it would prevent us from combining a shift into
15042 // the offset.
15043 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
15044 assert(Mem);
15045 const SDValue &Base = Mem->getBasePtr();
15046 if (Base.getOpcode() == ISD::ADD &&
15047 Base.getOperand(1).getOpcode() == ISD::SHL &&
15048 Base.getOperand(1).hasOneUse() &&
15049 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
15050 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
15051 if (Mem->getMemoryVT().isScalableVector())
15052 return false;
15053 // The shift can be combined if it matches the size of the value being
15054 // loaded (and so reducing the width would make it not match).
15055 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
15056 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
15057 if (ShiftAmount == Log2_32(LoadBytes))
15058 return false;
15059 }
15060 // We have no reason to disallow reducing the load width, so allow it.
15061 return true;
15062}
15063
15064// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
15065bool AArch64TargetLowering::shouldRemoveRedundantExtend(SDValue Extend) const {
15066  EVT VT = Extend.getValueType();
15067 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
15068 SDValue Extract = Extend.getOperand(0);
15069 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
15070 Extract = Extract.getOperand(0);
15071 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
15072 EVT VecVT = Extract.getOperand(0).getValueType();
15073 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
15074 return false;
15075 }
15076 }
15077 return true;
15078}
15079
15080// Truncations from a 64-bit GPR to a 32-bit GPR are free.
15081bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
15082  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
15083 return false;
15084 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
15085 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
15086 return NumBits1 > NumBits2;
15087}
15088bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
15089  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
15090 return false;
15091 uint64_t NumBits1 = VT1.getFixedSizeInBits();
15092 uint64_t NumBits2 = VT2.getFixedSizeInBits();
15093 return NumBits1 > NumBits2;
15094}
15095
15096/// Check if it is profitable to hoist instruction in then/else to if.
15097/// Not profitable if I and its user can form an FMA instruction,
15098/// because we prefer FMSUB/FMADD.
15099bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
15100  if (I->getOpcode() != Instruction::FMul)
15101 return true;
15102
15103 if (!I->hasOneUse())
15104 return true;
15105
15106 Instruction *User = I->user_back();
15107
15108 if (!(User->getOpcode() == Instruction::FSub ||
15109 User->getOpcode() == Instruction::FAdd))
15110 return true;
15111
15112  const TargetOptions &Options = getTargetMachine().Options;
15113  const Function *F = I->getFunction();
15114 const DataLayout &DL = F->getParent()->getDataLayout();
15115 Type *Ty = User->getOperand(0)->getType();
15116
15117 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
15118           isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
15119           (Options.AllowFPOpFusion == FPOpFusion::Fast ||
15120 Options.UnsafeFPMath));
15121}
15122
15123// All 32-bit GPR operations implicitly zero the high-half of the corresponding
15124// 64-bit GPR.
15125bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
15126  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
15127 return false;
15128 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
15129 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
15130 return NumBits1 == 32 && NumBits2 == 64;
15131}
15132bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
15133  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
15134 return false;
15135 unsigned NumBits1 = VT1.getSizeInBits();
15136 unsigned NumBits2 = VT2.getSizeInBits();
15137 return NumBits1 == 32 && NumBits2 == 64;
15138}
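// Illustrative example: after "add w0, w1, w2" the upper 32 bits of x0 are
// already zero, so a following (zext i32 %x to i64) requires no extra
// instruction; hence i32 -> i64 zero-extension is reported as free here.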
15139
15140bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
15141 EVT VT1 = Val.getValueType();
15142 if (isZExtFree(VT1, VT2)) {
15143 return true;
15144 }
15145
15146 if (Val.getOpcode() != ISD::LOAD)
15147 return false;
15148
15149 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
15150 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
15151 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
15152 VT1.getSizeInBits() <= 32);
15153}
15154
15155bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
15156 if (isa<FPExtInst>(Ext))
15157 return false;
15158
15159 // Vector types are not free.
15160 if (Ext->getType()->isVectorTy())
15161 return false;
15162
15163 for (const Use &U : Ext->uses()) {
15164 // The extension is free if we can fold it with a left shift in an
15165 // addressing mode or an arithmetic operation: add, sub, and cmp.
15166
15167 // Is there a shift?
15168 const Instruction *Instr = cast<Instruction>(U.getUser());
15169
15170 // Is this a constant shift?
15171 switch (Instr->getOpcode()) {
15172 case Instruction::Shl:
15173 if (!isa<ConstantInt>(Instr->getOperand(1)))
15174 return false;
15175 break;
15176 case Instruction::GetElementPtr: {
15177 gep_type_iterator GTI = gep_type_begin(Instr);
15178 auto &DL = Ext->getModule()->getDataLayout();
15179 std::advance(GTI, U.getOperandNo()-1);
15180 Type *IdxTy = GTI.getIndexedType();
15181 // This extension will end up with a shift because of the scaling factor.
15182 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
15183 // Get the shift amount based on the scaling factor:
15184 // log2(sizeof(IdxTy)) - log2(8).
15185 if (IdxTy->isScalableTy())
15186 return false;
15187 uint64_t ShiftAmt =
15188 llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
15189 3;
15190 // Is the constant foldable in the shift of the addressing mode?
15191 // I.e., shift amount is between 1 and 4 inclusive.
15192 if (ShiftAmt == 0 || ShiftAmt > 4)
15193 return false;
15194 break;
15195 }
15196 case Instruction::Trunc:
15197 // Check if this is a noop.
15198 // trunc(sext ty1 to ty2) to ty1.
15199 if (Instr->getType() == Ext->getOperand(0)->getType())
15200 continue;
15201 [[fallthrough]];
15202 default:
15203 return false;
15204 }
15205
15206 // At this point we can use the bfm family, so this extension is free
15207 // for that use.
15208 }
15209 return true;
15210}
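// Example of the folds considered above (illustrative): a sext feeding a GEP
// with a 4-byte element type becomes part of the addressing mode, e.g.
//   ldr w0, [x1, w2, sxtw #2]
// so the extension itself costs nothing for that use.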
15211
15212static bool isSplatShuffle(Value *V) {
15213 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
15214 return all_equal(Shuf->getShuffleMask());
15215 return false;
15216}
15217
15218/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
15219/// or upper half of the vector elements.
15220static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
15221 bool AllowSplat = false) {
15222 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
15223 auto *FullTy = FullV->getType();
15224 auto *HalfTy = HalfV->getType();
15225 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
15226 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
15227 };
15228
15229 auto extractHalf = [](Value *FullV, Value *HalfV) {
15230 auto *FullVT = cast<FixedVectorType>(FullV->getType());
15231 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
15232 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
15233 };
15234
15235 ArrayRef<int> M1, M2;
15236 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
15237 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
15238 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
15239 return false;
15240
15241 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
15242 // it is not checked as an extract below.
15243 if (AllowSplat && isSplatShuffle(Op1))
15244 S1Op1 = nullptr;
15245 if (AllowSplat && isSplatShuffle(Op2))
15246 S2Op1 = nullptr;
15247
15248 // Check that the operands are half as wide as the result and we extract
15249 // half of the elements of the input vectors.
15250 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
15251 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
15252 return false;
15253
15254 // Check the mask extracts either the lower or upper half of vector
15255 // elements.
15256 int M1Start = 0;
15257 int M2Start = 0;
15258 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
15259 if ((S1Op1 &&
15260 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
15261 (S2Op1 &&
15262 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
15263 return false;
15264
15265 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
15266 (M2Start != 0 && M2Start != (NumElements / 2)))
15267 return false;
15268 if (S1Op1 && S2Op1 && M1Start != M2Start)
15269 return false;
15270
15271 return true;
15272}
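// For example (illustrative), with <8 x i16> inputs the shuffle masks
// <0,1,2,3> and <4,5,6,7> extract the lower and upper halves respectively;
// such pairs can later map onto the high-half "2" forms of instructions like
// smull2/umull2.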
15273
15274/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
15275/// of the vector elements.
15276static bool areExtractExts(Value *Ext1, Value *Ext2) {
15277 auto areExtDoubled = [](Instruction *Ext) {
15278 return Ext->getType()->getScalarSizeInBits() ==
15279 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
15280 };
15281
15282 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
15283 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
15284 !areExtDoubled(cast<Instruction>(Ext1)) ||
15285 !areExtDoubled(cast<Instruction>(Ext2)))
15286 return false;
15287
15288 return true;
15289}
15290
15291/// Check if Op could be used with vmull_high_p64 intrinsic.
15292static bool isOperandOfVmullHighP64(Value *Op) {
15293 Value *VectorOperand = nullptr;
15294 ConstantInt *ElementIndex = nullptr;
15295 return match(Op, m_ExtractElt(m_Value(VectorOperand),
15296 m_ConstantInt(ElementIndex))) &&
15297 ElementIndex->getValue() == 1 &&
15298 isa<FixedVectorType>(VectorOperand->getType()) &&
15299 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
15300}
15301
15302/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
15303static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
15304 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
15305}
15306
15307static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
15308 // Restrict ourselves to the form CodeGenPrepare typically constructs.
15309 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
15310 if (!GEP || GEP->getNumOperands() != 2)
15311 return false;
15312
15313 Value *Base = GEP->getOperand(0);
15314 Value *Offsets = GEP->getOperand(1);
15315
15316 // We only care about scalar_base+vector_offsets.
15317 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
15318 return false;
15319
15320 // Sink extends that would allow us to use 32-bit offset vectors.
15321 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
15322 auto *OffsetsInst = cast<Instruction>(Offsets);
15323 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
15324 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
15325 Ops.push_back(&GEP->getOperandUse(1));
15326 }
15327
15328 // Sink the GEP.
15329 return true;
15330}
15331
15332/// We want to sink the following cases:
15333/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale
15334static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
15335 if (match(Op, m_VScale()))
15336 return true;
15337 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
15338 match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
15339 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
15340 return true;
15341 }
15342 return false;
15343}
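// Illustrative IR for the cases matched above (hypothetical values):
//   %vs  = call i64 @llvm.vscale.i64()
//   %sz  = shl i64 %vs, 4
//   %gep = getelementptr i8, ptr %base, i64 %sz
// Sinking the shl next to its user lets isel fold the whole scalable offset
// into a single addvl-style address computation.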
15344
15345/// Check if sinking \p I's operands to I's basic block is profitable, because
15346/// the operands can be folded into a target instruction, e.g.
15347/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
15348bool AArch64TargetLowering::shouldSinkOperands(
15349 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
15350 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
15351 switch (II->getIntrinsicID()) {
15352 case Intrinsic::aarch64_neon_smull:
15353 case Intrinsic::aarch64_neon_umull:
15354 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
15355 /*AllowSplat=*/true)) {
15356 Ops.push_back(&II->getOperandUse(0));
15357 Ops.push_back(&II->getOperandUse(1));
15358 return true;
15359 }
15360 [[fallthrough]];
15361
15362 case Intrinsic::fma:
15363 if (isa<VectorType>(I->getType()) &&
15364 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
15365 !Subtarget->hasFullFP16())
15366 return false;
15367 [[fallthrough]];
15368 case Intrinsic::aarch64_neon_sqdmull:
15369 case Intrinsic::aarch64_neon_sqdmulh:
15370 case Intrinsic::aarch64_neon_sqrdmulh:
15371 // Sink splats for index lane variants
15372 if (isSplatShuffle(II->getOperand(0)))
15373 Ops.push_back(&II->getOperandUse(0));
15374 if (isSplatShuffle(II->getOperand(1)))
15375 Ops.push_back(&II->getOperandUse(1));
15376 return !Ops.empty();
15377 case Intrinsic::aarch64_neon_fmlal:
15378 case Intrinsic::aarch64_neon_fmlal2:
15379 case Intrinsic::aarch64_neon_fmlsl:
15380 case Intrinsic::aarch64_neon_fmlsl2:
15381 // Sink splats for index lane variants
15382 if (isSplatShuffle(II->getOperand(1)))
15383 Ops.push_back(&II->getOperandUse(1));
15384 if (isSplatShuffle(II->getOperand(2)))
15385 Ops.push_back(&II->getOperandUse(2));
15386 return !Ops.empty();
15387 case Intrinsic::aarch64_sve_ptest_first:
15388 case Intrinsic::aarch64_sve_ptest_last:
15389 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
15390 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
15391 Ops.push_back(&II->getOperandUse(0));
15392 return !Ops.empty();
15393 case Intrinsic::aarch64_sme_write_horiz:
15394 case Intrinsic::aarch64_sme_write_vert:
15395 case Intrinsic::aarch64_sme_writeq_horiz:
15396 case Intrinsic::aarch64_sme_writeq_vert: {
15397 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
15398 if (!Idx || Idx->getOpcode() != Instruction::Add)
15399 return false;
15400 Ops.push_back(&II->getOperandUse(1));
15401 return true;
15402 }
15403 case Intrinsic::aarch64_sme_read_horiz:
15404 case Intrinsic::aarch64_sme_read_vert:
15405 case Intrinsic::aarch64_sme_readq_horiz:
15406 case Intrinsic::aarch64_sme_readq_vert:
15407 case Intrinsic::aarch64_sme_ld1b_vert:
15408 case Intrinsic::aarch64_sme_ld1h_vert:
15409 case Intrinsic::aarch64_sme_ld1w_vert:
15410 case Intrinsic::aarch64_sme_ld1d_vert:
15411 case Intrinsic::aarch64_sme_ld1q_vert:
15412 case Intrinsic::aarch64_sme_st1b_vert:
15413 case Intrinsic::aarch64_sme_st1h_vert:
15414 case Intrinsic::aarch64_sme_st1w_vert:
15415 case Intrinsic::aarch64_sme_st1d_vert:
15416 case Intrinsic::aarch64_sme_st1q_vert:
15417 case Intrinsic::aarch64_sme_ld1b_horiz:
15418 case Intrinsic::aarch64_sme_ld1h_horiz:
15419 case Intrinsic::aarch64_sme_ld1w_horiz:
15420 case Intrinsic::aarch64_sme_ld1d_horiz:
15421 case Intrinsic::aarch64_sme_ld1q_horiz:
15422 case Intrinsic::aarch64_sme_st1b_horiz:
15423 case Intrinsic::aarch64_sme_st1h_horiz:
15424 case Intrinsic::aarch64_sme_st1w_horiz:
15425 case Intrinsic::aarch64_sme_st1d_horiz:
15426 case Intrinsic::aarch64_sme_st1q_horiz: {
15427 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
15428 if (!Idx || Idx->getOpcode() != Instruction::Add)
15429 return false;
15430 Ops.push_back(&II->getOperandUse(3));
15431 return true;
15432 }
15433 case Intrinsic::aarch64_neon_pmull:
15434 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
15435 return false;
15436 Ops.push_back(&II->getOperandUse(0));
15437 Ops.push_back(&II->getOperandUse(1));
15438 return true;
15439 case Intrinsic::aarch64_neon_pmull64:
15440 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
15441 II->getArgOperand(1)))
15442 return false;
15443 Ops.push_back(&II->getArgOperandUse(0));
15444 Ops.push_back(&II->getArgOperandUse(1));
15445 return true;
15446 case Intrinsic::masked_gather:
15447 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
15448 return false;
15449 Ops.push_back(&II->getArgOperandUse(0));
15450 return true;
15451 case Intrinsic::masked_scatter:
15452 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
15453 return false;
15454 Ops.push_back(&II->getArgOperandUse(1));
15455 return true;
15456 default:
15457 return false;
15458 }
15459 }
15460
15461 // Sink vscales closer to uses for better isel
15462 switch (I->getOpcode()) {
15463 case Instruction::GetElementPtr:
15464 case Instruction::Add:
15465 case Instruction::Sub:
15466 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
15467 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
15468 Ops.push_back(&I->getOperandUse(Op));
15469 return true;
15470 }
15471 }
15472 break;
15473 default:
15474 break;
15475 }
15476
15477 if (!I->getType()->isVectorTy())
15478 return false;
15479
15480 switch (I->getOpcode()) {
15481 case Instruction::Sub:
15482 case Instruction::Add: {
15483 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
15484 return false;
15485
15486 // If the exts' operands extract either the lower or upper elements, we
15487 // can sink them too.
15488 auto Ext1 = cast<Instruction>(I->getOperand(0));
15489 auto Ext2 = cast<Instruction>(I->getOperand(1));
15490 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
15491 Ops.push_back(&Ext1->getOperandUse(0));
15492 Ops.push_back(&Ext2->getOperandUse(0));
15493 }
15494
15495 Ops.push_back(&I->getOperandUse(0));
15496 Ops.push_back(&I->getOperandUse(1));
15497
15498 return true;
15499 }
15500 case Instruction::Or: {
15501 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
15502 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
15503 if (Subtarget->hasNEON()) {
15504 Instruction *OtherAnd, *IA, *IB;
15505 Value *MaskValue;
15506 // MainAnd refers to the And instruction that has 'Not' as one of its operands
15507 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
15508 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
15509 m_Instruction(IA)))))) {
15510 if (match(OtherAnd,
15511 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
15512 Instruction *MainAnd = I->getOperand(0) == OtherAnd
15513 ? cast<Instruction>(I->getOperand(1))
15514 : cast<Instruction>(I->getOperand(0));
15515
15516 // Both Ands should be in same basic block as Or
15517 if (I->getParent() != MainAnd->getParent() ||
15518 I->getParent() != OtherAnd->getParent())
15519 return false;
15520
15521 // Non-mask operands of both Ands should also be in same basic block
15522 if (I->getParent() != IA->getParent() ||
15523 I->getParent() != IB->getParent())
15524 return false;
15525
15526 Ops.push_back(&MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
15527 Ops.push_back(&I->getOperandUse(0));
15528 Ops.push_back(&I->getOperandUse(1));
15529
15530 return true;
15531 }
15532 }
15533 }
15534
15535 return false;
15536 }
15537 case Instruction::Mul: {
15538 int NumZExts = 0, NumSExts = 0;
15539 for (auto &Op : I->operands()) {
15540 // Make sure we are not already sinking this operand
15541 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
15542 continue;
15543
15544 if (match(&Op, m_SExt(m_Value()))) {
15545 NumSExts++;
15546 continue;
15547 } else if (match(&Op, m_ZExt(m_Value()))) {
15548 NumZExts++;
15549 continue;
15550 }
15551
15552 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
15553
15554 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
15555 // operand and the s/zext can help create indexed s/umull. This is
15556 // especially useful to prevent i64 mul being scalarized.
15557 if (Shuffle && isSplatShuffle(Shuffle) &&
15558 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
15559 Ops.push_back(&Shuffle->getOperandUse(0));
15560 Ops.push_back(&Op);
15561 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
15562 NumSExts++;
15563 else
15564 NumZExts++;
15565 continue;
15566 }
15567
15568 if (!Shuffle)
15569 continue;
15570
15571 Value *ShuffleOperand = Shuffle->getOperand(0);
15572 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
15573 if (!Insert)
15574 continue;
15575
15576 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
15577 if (!OperandInstr)
15578 continue;
15579
15580 ConstantInt *ElementConstant =
15581 dyn_cast<ConstantInt>(Insert->getOperand(2));
15582 // Check that the insertelement is inserting into element 0
15583 if (!ElementConstant || !ElementConstant->isZero())
15584 continue;
15585
15586 unsigned Opcode = OperandInstr->getOpcode();
15587 if (Opcode == Instruction::SExt)
15588 NumSExts++;
15589 else if (Opcode == Instruction::ZExt)
15590 NumZExts++;
15591 else {
15592 // If we find that the top bits are known 0, then we can sink and allow
15593 // the backend to generate a umull.
15594 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
15595 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
15596 const DataLayout &DL = I->getFunction()->getParent()->getDataLayout();
15597 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
15598 continue;
15599 NumZExts++;
15600 }
15601
15602 Ops.push_back(&Shuffle->getOperandUse(0));
15603 Ops.push_back(&Op);
15604 }
15605
15606 // Sinking is profitable if we found two extends of the same type.
15607 return !Ops.empty() && (NumSExts == 2 || NumZExts == 2);
15608 }
15609 default:
15610 return false;
15611 }
15612 return false;
15613}
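// Illustrative example of the Mul case above: with both extends sunk into the
// block of the multiply,
//   %sa = sext <8 x i8> %a to <8 x i16>
//   %sb = sext <8 x i8> %b to <8 x i16>
//   %m  = mul <8 x i16> %sa, %sb
// can be selected as a single "smull v0.8h, v1.8b, v2.8b" instead of two
// widening extends followed by a full-width multiply.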
15614
15615static bool createTblShuffleForZExt(ZExtInst *ZExt, FixedVectorType *DstTy,
15616 bool IsLittleEndian) {
15617 Value *Op = ZExt->getOperand(0);
15618 auto *SrcTy = cast<FixedVectorType>(Op->getType());
15619 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
15620 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
15621 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth >= 64)
15622 return false;
15623
15624 assert(DstWidth % SrcWidth == 0 &&
15625 "TBL lowering is not supported for a ZExt instruction with this "
15626 "source & destination element type.");
15627 unsigned ZExtFactor = DstWidth / SrcWidth;
15628 unsigned NumElts = SrcTy->getNumElements();
15629 IRBuilder<> Builder(ZExt);
15630 SmallVector<int> Mask;
15631 // Create a mask that selects <0,...,Op[i]> for each lane of the destination
15632 // vector to replace the original ZExt. This can later be lowered to a set of
15633 // tbl instructions.
15634 for (unsigned i = 0; i < NumElts * ZExtFactor; i++) {
15635 if (IsLittleEndian) {
15636 if (i % ZExtFactor == 0)
15637 Mask.push_back(i / ZExtFactor);
15638 else
15639 Mask.push_back(NumElts);
15640 } else {
15641 if ((i + 1) % ZExtFactor == 0)
15642 Mask.push_back((i - ZExtFactor + 1) / ZExtFactor);
15643 else
15644 Mask.push_back(NumElts);
15645 }
15646 }
15647
15648 auto *FirstEltZero = Builder.CreateInsertElement(
15649 PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
15650 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
15651 Result = Builder.CreateBitCast(Result, DstTy);
15652 if (DstTy != ZExt->getType())
15653 Result = Builder.CreateZExt(Result, ZExt->getType());
15654 ZExt->replaceAllUsesWith(Result);
15655 ZExt->eraseFromParent();
15656 return true;
15657}
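// Worked example (illustrative): for zext <8 x i8> %x to <8 x i32> on a
// little-endian target, ZExtFactor is 4 and the mask built above is
//   <0,8,8,8, 1,8,8,8, 2,8,8,8, ..., 7,8,8,8>
// where index 8 selects the known-zero lane inserted into FirstEltZero, so the
// shuffle produces the zero-extended bytes and the bitcast yields <8 x i32>.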
15658
15659static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
15660 IRBuilder<> Builder(TI);
15661 SmallVector<Value *> Parts;
15662 int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
15663 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
15664 auto *DstTy = cast<FixedVectorType>(TI->getType());
15665 assert(SrcTy->getElementType()->isIntegerTy() &&
15666 "Non-integer type source vector element is not supported");
15667 assert(DstTy->getElementType()->isIntegerTy(8) &&
15668 "Unsupported destination vector element type");
15669 unsigned SrcElemTySz =
15670 cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
15671 unsigned DstElemTySz =
15672 cast<IntegerType>(DstTy->getElementType())->getBitWidth();
15673 assert((SrcElemTySz % DstElemTySz == 0) &&
15674 "Cannot lower truncate to tbl instructions for a source element size "
15675 "that is not divisible by the destination element size");
15676 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
15677 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
15678 "Unsupported source vector element type size");
15679 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
15680
15681 // Create a mask to choose every nth byte from the source vector table of
15682 // bytes to create the truncated destination vector, where 'n' is the truncate
15683 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose
15684 // 0,8,16,..Y*8th bytes for the little-endian format
15685 SmallVector<Constant *, 16> MaskConst;
15686 for (int Itr = 0; Itr < 16; Itr++) {
15687 if (Itr < NumElements)
15688 MaskConst.push_back(Builder.getInt8(
15689 IsLittleEndian ? Itr * TruncFactor
15690 : Itr * TruncFactor + (TruncFactor - 1)));
15691 else
15692 MaskConst.push_back(Builder.getInt8(255));
15693 }
15694
15695 int MaxTblSz = 128 * 4;
15696 int MaxSrcSz = SrcElemTySz * NumElements;
15697 int ElemsPerTbl =
15698 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
15699 assert(ElemsPerTbl <= 16 &&
15700 "Maximum elements selected using TBL instruction cannot exceed 16!");
15701
15702 int ShuffleCount = 128 / SrcElemTySz;
15703 SmallVector<int> ShuffleLanes;
15704 for (int i = 0; i < ShuffleCount; ++i)
15705 ShuffleLanes.push_back(i);
15706
15707 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
15708 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
15709 // call TBL & save the result in a vector of TBL results for combining later.
15710 SmallVector<Value *> Results;
15711 while (ShuffleLanes.back() < NumElements) {
15712 Parts.push_back(Builder.CreateBitCast(
15713 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
15714
15715 if (Parts.size() == 4) {
15716 auto *F = Intrinsic::getDeclaration(TI->getModule(),
15717 Intrinsic::aarch64_neon_tbl4, VecTy);
15718 Parts.push_back(ConstantVector::get(MaskConst));
15719 Results.push_back(Builder.CreateCall(F, Parts));
15720 Parts.clear();
15721 }
15722
15723 for (int i = 0; i < ShuffleCount; ++i)
15724 ShuffleLanes[i] += ShuffleCount;
15725 }
15726
15727 assert((Parts.empty() || Results.empty()) &&
15728 "Lowering trunc for vectors requiring different TBL instructions is "
15729 "not supported!");
15730 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
15731 // registers
15732 if (!Parts.empty()) {
15733 Intrinsic::ID TblID;
15734 switch (Parts.size()) {
15735 case 1:
15736 TblID = Intrinsic::aarch64_neon_tbl1;
15737 break;
15738 case 2:
15739 TblID = Intrinsic::aarch64_neon_tbl2;
15740 break;
15741 case 3:
15742 TblID = Intrinsic::aarch64_neon_tbl3;
15743 break;
15744 }
15745
15746 auto *F = Intrinsic::getDeclaration(TI->getModule(), TblID, VecTy);
15747 Parts.push_back(ConstantVector::get(MaskConst));
15748 Results.push_back(Builder.CreateCall(F, Parts));
15749 }
15750
15751 // Extract the destination vector from TBL result(s) after combining them
15752 // where applicable. Currently, at most two TBLs are supported.
15753 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
15754 "more than 2 tbl instructions!");
15755 Value *FinalResult = Results[0];
15756 if (Results.size() == 1) {
15757 if (ElemsPerTbl < 16) {
15758 SmallVector<int> FinalMask(ElemsPerTbl);
15759 std::iota(FinalMask.begin(), FinalMask.end(), 0);
15760 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
15761 }
15762 } else {
15763 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
15764 if (ElemsPerTbl < 16) {
15765 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
15766 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
15767 } else {
15768 std::iota(FinalMask.begin(), FinalMask.end(), 0);
15769 }
15770 FinalResult =
15771 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
15772 }
15773
15774 TI->replaceAllUsesWith(FinalResult);
15775 TI->eraseFromParent();
15776}
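// Worked example (illustrative): for trunc <16 x i32> %x to <16 x i8> on a
// little-endian target, TruncFactor is 4, the 64 source bytes are shuffled
// into four 128-bit table registers, and the tbl4 mask selects bytes
// 0,4,8,...,60, i.e. the low byte of each i32 lane.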
15777
15778bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
15779 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
15780 // shuffle_vector instructions are serialized when targeting SVE,
15781 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
15782 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
15783 return false;
15784
15785 // Try to optimize conversions using tbl. This requires materializing constant
15786 // index vectors, which can increase code size and add loads. Skip the
15787 // transform unless the conversion is in a loop block guaranteed to execute
15788 // and we are not optimizing for size.
15789 Function *F = I->getParent()->getParent();
15790 if (!L || L->getHeader() != I->getParent() || F->hasMinSize() ||
15791 F->hasOptSize())
15792 return false;
15793
15794 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
15795 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
15796 if (!SrcTy || !DstTy)
15797 return false;
15798
15799 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
15800 // lowered to tbl instructions to insert the original i8 elements
15801 // into i8x lanes. This is enabled for cases where it is beneficial.
15802 auto *ZExt = dyn_cast<ZExtInst>(I);
15803 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
15804 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
15805 if (DstWidth % 8 != 0)
15806 return false;
15807
15808 auto *TruncDstType =
15809 cast<FixedVectorType>(VectorType::getTruncatedElementVectorType(DstTy));
15810 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
15811 // the remaining ZExt folded into the user, don't use tbl lowering.
15812 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
15813 if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
15816 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
15817 return false;
15818
15819 DstTy = TruncDstType;
15820 }
15821
15822 return createTblShuffleForZExt(ZExt, DstTy, Subtarget->isLittleEndian());
15823 }
15824
15825 auto *UIToFP = dyn_cast<UIToFPInst>(I);
15826 if (UIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
15827 DstTy->getElementType()->isFloatTy()) {
15828 IRBuilder<> Builder(I);
15829 auto *ZExt = cast<ZExtInst>(
15830 Builder.CreateZExt(I->getOperand(0), VectorType::getInteger(DstTy)));
15831 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
15832 I->replaceAllUsesWith(UI);
15833 I->eraseFromParent();
15834 return createTblShuffleForZExt(ZExt, cast<FixedVectorType>(ZExt->getType()),
15835 Subtarget->isLittleEndian());
15836 }
15837
15838 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
15839 // followed by a truncate lowered to using tbl.4.
15840 auto *FPToUI = dyn_cast<FPToUIInst>(I);
15841 if (FPToUI &&
15842 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
15843 SrcTy->getElementType()->isFloatTy() &&
15844 DstTy->getElementType()->isIntegerTy(8)) {
15845 IRBuilder<> Builder(I);
15846 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
15847 VectorType::getInteger(SrcTy));
15848 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
15849 I->replaceAllUsesWith(TruncI);
15850 I->eraseFromParent();
15851 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
15852 return true;
15853 }
15854
15855 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
15856 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
15857 // per lane of the input that is represented using 1,2,3 or 4 128-bit table
15858 // registers
15859 auto *TI = dyn_cast<TruncInst>(I);
15860 if (TI && DstTy->getElementType()->isIntegerTy(8) &&
15861 ((SrcTy->getElementType()->isIntegerTy(32) ||
15862 SrcTy->getElementType()->isIntegerTy(64)) &&
15863 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
15864 createTblForTrunc(TI, Subtarget->isLittleEndian());
15865 return true;
15866 }
15867
15868 return false;
15869}
15870
15871bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
15872 Align &RequiredAligment) const {
15873 if (!LoadedType.isSimple() ||
15874 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
15875 return false;
15876 // Cyclone supports unaligned accesses.
15877 RequiredAligment = Align(1);
15878 unsigned NumBits = LoadedType.getSizeInBits();
15879 return NumBits == 32 || NumBits == 64;
15880}
15881
15882/// A helper function for determining the number of interleaved accesses we
15883/// will generate when lowering accesses of the given type.
15884unsigned AArch64TargetLowering::getNumInterleavedAccesses(
15885 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
15886 unsigned VecSize = 128;
15887 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
15888 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
15889 if (UseScalable && isa<FixedVectorType>(VecTy))
15890 VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
15891 return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
15892}
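// For example (illustrative): a <16 x i32> group is 512 bits, so with 128-bit
// NEON registers this returns (16 * 32 + 127) / 128 = 4 interleaved accesses.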
15893
15894MachineMemOperand::Flags
15895AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
15896 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
15897 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
15898 return MOStridedAccess;
15899 return MachineMemOperand::MONone;
15900}
15901
15902bool AArch64TargetLowering::isLegalInterleavedAccessType(
15903 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
15904 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
15905 auto EC = VecTy->getElementCount();
15906 unsigned MinElts = EC.getKnownMinValue();
15907
15908 UseScalable = false;
15909
15910 if (!VecTy->isScalableTy() && !Subtarget->hasNEON())
15911 return false;
15912
15913 if (VecTy->isScalableTy() && !Subtarget->hasSVEorSME())
15914 return false;
15915
15916 // Ensure that the predicate for this number of elements is available.
15917 if (Subtarget->hasSVE() && !getSVEPredPatternFromNumElements(MinElts))
15918 return false;
15919
15920 // Ensure the number of vector elements is greater than 1.
15921 if (MinElts < 2)
15922 return false;
15923
15924 // Ensure the element type is legal.
15925 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
15926 return false;
15927
15928 if (EC.isScalable()) {
15929 UseScalable = true;
15930 return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
15931 }
15932
15933 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
15934 if (!Subtarget->isNeonAvailable() ||
15935 (Subtarget->useSVEForFixedLengthVectors() &&
15936 (VecSize % Subtarget->getMinSVEVectorSizeInBits() == 0 ||
15937 (VecSize < Subtarget->getMinSVEVectorSizeInBits() &&
15938 isPowerOf2_32(MinElts) && VecSize > 128)))) {
15939 UseScalable = true;
15940 return true;
15941 }
15942
15943 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
15944 // 128 will be split into multiple interleaved accesses.
15945 return VecSize == 64 || VecSize % 128 == 0;
15946}
15947
15948static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
15949 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
15950 return ScalableVectorType::get(VTy->getElementType(), 2);
15951
15952 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
15953 return ScalableVectorType::get(VTy->getElementType(), 4);
15954
15955 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
15956 return ScalableVectorType::get(VTy->getElementType(), 8);
15957
15958 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
15959 return ScalableVectorType::get(VTy->getElementType(), 8);
15960
15961 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
15962 return ScalableVectorType::get(VTy->getElementType(), 2);
15963
15964 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
15965 return ScalableVectorType::get(VTy->getElementType(), 4);
15966
15967 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
15968 return ScalableVectorType::get(VTy->getElementType(), 8);
15969
15970 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
15971 return ScalableVectorType::get(VTy->getElementType(), 16);
15972
15973 llvm_unreachable("Cannot handle input vector type");
15974}
15975
15976static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
15977 bool Scalable, Type *LDVTy,
15978 Type *PtrTy) {
15979 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
15980 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
15981 Intrinsic::aarch64_sve_ld3_sret,
15982 Intrinsic::aarch64_sve_ld4_sret};
15983 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
15984 Intrinsic::aarch64_neon_ld3,
15985 Intrinsic::aarch64_neon_ld4};
15986 if (Scalable)
15987 return Intrinsic::getDeclaration(M, SVELoads[Factor - 2], {LDVTy});
15988
15989 return Intrinsic::getDeclaration(M, NEONLoads[Factor - 2], {LDVTy, PtrTy});
15990}
15991
15992static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
15993 bool Scalable, Type *STVTy,
15994 Type *PtrTy) {
15995 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
15996 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
15997 Intrinsic::aarch64_sve_st3,
15998 Intrinsic::aarch64_sve_st4};
15999 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
16000 Intrinsic::aarch64_neon_st3,
16001 Intrinsic::aarch64_neon_st4};
16002 if (Scalable)
16003 return Intrinsic::getDeclaration(M, SVEStores[Factor - 2], {STVTy});
16004
16005 return Intrinsic::getDeclaration(M, NEONStores[Factor - 2], {STVTy, PtrTy});
16006}
16007
16008/// Lower an interleaved load into a ldN intrinsic.
16009///
16010/// E.g. Lower an interleaved load (Factor = 2):
16011/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
16012/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
16013/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
16014///
16015/// Into:
16016/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
16017/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
16018/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
16019bool AArch64TargetLowering::lowerInterleavedLoad(
16020 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
16021 ArrayRef<unsigned> Indices, unsigned Factor) const {
16022 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16023 "Invalid interleave factor");
16024 assert(!Shuffles.empty() && "Empty shufflevector input");
16025 assert(Shuffles.size() == Indices.size() &&
16026 "Unmatched number of shufflevectors and indices");
16027
16028 const DataLayout &DL = LI->getModule()->getDataLayout();
16029
16030 VectorType *VTy = Shuffles[0]->getType();
16031
16032 // Skip if we do not have NEON and skip illegal vector types. We can
16033 // "legalize" wide vector types into multiple interleaved accesses as long as
16034 // the vector types are divisible by 128.
16035 bool UseScalable;
16036 if (!Subtarget->hasNEON() ||
16037 !isLegalInterleavedAccessType(VTy, DL, UseScalable))
16038 return false;
16039
16040 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
16041
16042 auto *FVTy = cast<FixedVectorType>(VTy);
16043
16044 // A pointer vector can not be the return type of the ldN intrinsics. Need to
16045 // load integer vectors first and then convert to pointer vectors.
16046 Type *EltTy = FVTy->getElementType();
16047 if (EltTy->isPointerTy())
16048 FVTy =
16049 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
16050
16051 // If we're going to generate more than one load, reset the sub-vector type
16052 // to something legal.
16053 FVTy = FixedVectorType::get(FVTy->getElementType(),
16054 FVTy->getNumElements() / NumLoads);
16055
16056 auto *LDVTy =
16057 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
16058
16059 IRBuilder<> Builder(LI);
16060
16061 // The base address of the load.
16062 Value *BaseAddr = LI->getPointerOperand();
16063
16064 Type *PtrTy = LI->getPointerOperandType();
16065 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
16066 LDVTy->getElementCount());
16067
16068 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
16069 UseScalable, LDVTy, PtrTy);
16070
16071 // Holds sub-vectors extracted from the load intrinsic return values. The
16072 // sub-vectors are associated with the shufflevector instructions they will
16073 // replace.
16074 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
16075
16076 Value *PTrue = nullptr;
16077 if (UseScalable) {
16078 std::optional<unsigned> PgPattern =
16079 getSVEPredPatternFromNumElements(FVTy->getNumElements());
16080 if (Subtarget->getMinSVEVectorSizeInBits() ==
16081 Subtarget->getMaxSVEVectorSizeInBits() &&
16082 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
16083 PgPattern = AArch64SVEPredPattern::all;
16084
16085 auto *PTruePat =
16086 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
16087 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
16088 {PTruePat});
16089 }
16090
16091 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
16092
16093 // If we're generating more than one load, compute the base address of
16094 // subsequent loads as an offset from the previous.
16095 if (LoadCount > 0)
16096 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
16097 FVTy->getNumElements() * Factor);
16098
16099 CallInst *LdN;
16100 if (UseScalable)
16101 LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
16102 else
16103 LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
16104
16105 // Extract and store the sub-vectors returned by the load intrinsic.
16106 for (unsigned i = 0; i < Shuffles.size(); i++) {
16107 ShuffleVectorInst *SVI = Shuffles[i];
16108 unsigned Index = Indices[i];
16109
16110 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
16111
16112 if (UseScalable)
16113 SubVec = Builder.CreateExtractVector(
16114 FVTy, SubVec,
16115 ConstantInt::get(Type::getInt64Ty(VTy->getContext()), 0));
16116
16117 // Convert the integer vector to pointer vector if the element is pointer.
16118 if (EltTy->isPointerTy())
16119 SubVec = Builder.CreateIntToPtr(
16120 SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
16121 FVTy->getNumElements()));
16122
16123 SubVecs[SVI].push_back(SubVec);
16124 }
16125 }
16126
16127 // Replace uses of the shufflevector instructions with the sub-vectors
16128 // returned by the load intrinsic. If a shufflevector instruction is
16129 // associated with more than one sub-vector, those sub-vectors will be
16130 // concatenated into a single wide vector.
16131 for (ShuffleVectorInst *SVI : Shuffles) {
16132 auto &SubVec = SubVecs[SVI];
16133 auto *WideVec =
16134 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
16135 SVI->replaceAllUsesWith(WideVec);
16136 }
16137
16138 return true;
16139}
16140
16141template <typename Iter>
16142bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
16143 int MaxLookupDist = 20;
16144 unsigned IdxWidth = DL.getIndexSizeInBits(0);
16145 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
16146 const Value *PtrA1 =
16147 Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
16148
16149 while (++It != End) {
16150 if (It->isDebugOrPseudoInst())
16151 continue;
16152 if (MaxLookupDist-- == 0)
16153 break;
16154 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
16155 const Value *PtrB1 =
16156 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
16157 DL, OffsetB);
16158 if (PtrA1 == PtrB1 &&
16159 (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
16160 .abs() == 16)
16161 return true;
16162 }
16163 }
16164
16165 return false;
16166}
16167
16168/// Lower an interleaved store into a stN intrinsic.
16169///
16170/// E.g. Lower an interleaved store (Factor = 3):
16171/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
16172/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
16173/// store <12 x i32> %i.vec, <12 x i32>* %ptr
16174///
16175/// Into:
16176/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
16177/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
16178/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
16179/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
16180///
16181/// Note that the new shufflevectors will be removed and we'll only generate one
16182/// st3 instruction in CodeGen.
16183///
16184/// Example for a more general valid mask (Factor 3). Lower:
16185/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
16186/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
16187/// store <12 x i32> %i.vec, <12 x i32>* %ptr
16188///
16189/// Into:
16190/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
16191/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
16192/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
16193/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
16194bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
16195 ShuffleVectorInst *SVI,
16196 unsigned Factor) const {
16197
16198 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16199 "Invalid interleave factor");
16200
16201 auto *VecTy = cast<FixedVectorType>(SVI->getType());
16202 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
16203
16204 unsigned LaneLen = VecTy->getNumElements() / Factor;
16205 Type *EltTy = VecTy->getElementType();
16206 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
16207
16208 const DataLayout &DL = SI->getModule()->getDataLayout();
16209 bool UseScalable;
16210
16211 // Skip if we do not have NEON and skip illegal vector types. We can
16212 // "legalize" wide vector types into multiple interleaved accesses as long as
16213 // the vector types are divisible by 128.
16214 if (!Subtarget->hasNEON() ||
16215 !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
16216 return false;
16217
16218 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
16219
16220 Value *Op0 = SVI->getOperand(0);
16221 Value *Op1 = SVI->getOperand(1);
16222 IRBuilder<> Builder(SI);
16223
16224 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
16225 // vectors to integer vectors.
16226 if (EltTy->isPointerTy()) {
16227 Type *IntTy = DL.getIntPtrType(EltTy);
16228 unsigned NumOpElts =
16229 cast<FixedVectorType>(Op0->getType())->getNumElements();
16230
16231 // Convert to the corresponding integer vector.
16232 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
16233 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
16234 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
16235
16236 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
16237 }
16238
16239 // If we're going to generate more than one store, reset the lane length
16240 // and sub-vector type to something legal.
16241 LaneLen /= NumStores;
16242 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
16243
16244 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
16245 : SubVecTy;
16246
16247 // The base address of the store.
16248 Value *BaseAddr = SI->getPointerOperand();
16249
16250 auto Mask = SVI->getShuffleMask();
16251
16252 // Sanity check: bail out if none of the mask indices are in range.
16253 // If the mask is `poison`, `Mask` may be a vector of -1s.
16254 // If all of them are `poison`, an out-of-bounds read would happen later.
16255 if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
16256 return false;
16257 }
16258 // A 64-bit st2 which does not start at element 0 will involve adding extra
16259 // ext elements, making the st2 unprofitable. If there is a nearby store
16260 // that points to BaseAddr+16 or BaseAddr-16, it can be better left as a
16261 // zip;stp pair, which has higher throughput.
16262 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
16263 (Mask[0] != 0 ||
16264 hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
16265 DL) ||
16266 hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
16267 BaseAddr, DL)))
16268 return false;
16269
16270 Type *PtrTy = SI->getPointerOperandType();
16271 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
16272 STVTy->getElementCount());
16273
16274 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
16275 UseScalable, STVTy, PtrTy);
16276
16277 Value *PTrue = nullptr;
16278 if (UseScalable) {
16279 std::optional<unsigned> PgPattern =
16280 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
16281 if (Subtarget->getMinSVEVectorSizeInBits() ==
16282 Subtarget->getMaxSVEVectorSizeInBits() &&
16283 Subtarget->getMinSVEVectorSizeInBits() ==
16284 DL.getTypeSizeInBits(SubVecTy))
16285 PgPattern = AArch64SVEPredPattern::all;
16286
16287 auto *PTruePat =
16288 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
16289 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
16290 {PTruePat});
16291 }
16292
16293 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
16294
16296
16297 // Split the shufflevector operands into sub vectors for the new stN call.
16298 for (unsigned i = 0; i < Factor; i++) {
16299 Value *Shuffle;
16300 unsigned IdxI = StoreCount * LaneLen * Factor + i;
16301 if (Mask[IdxI] >= 0) {
16302 Shuffle = Builder.CreateShuffleVector(
16303 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
16304 } else {
16305 unsigned StartMask = 0;
16306 for (unsigned j = 1; j < LaneLen; j++) {
16307 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
16308 if (Mask[IdxJ] >= 0) {
16309 StartMask = Mask[IdxJ] - j;
16310 break;
16311 }
16312 }
16313 // Note: Filling undef gaps with random elements is ok, since
16314 // those elements were being written anyway (with undefs).
16315 // In the case of all undefs we're defaulting to using elems from 0
16316 // Note: StartMask cannot be negative, it's checked in
16317 // isReInterleaveMask
16318 Shuffle = Builder.CreateShuffleVector(
16319 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
16320 }
16321
16322 if (UseScalable)
16323 Shuffle = Builder.CreateInsertVector(
16324 STVTy, UndefValue::get(STVTy), Shuffle,
16325 ConstantInt::get(Type::getInt64Ty(STVTy->getContext()), 0));
16326
16327 Ops.push_back(Shuffle);
16328 }
16329
16330 if (UseScalable)
16331 Ops.push_back(PTrue);
16332
16333 // If we're generating more than one store, we compute the base address of
16334 // subsequent stores as an offset from the previous.
16335 if (StoreCount > 0)
16336 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
16337 BaseAddr, LaneLen * Factor);
16338
16339 Ops.push_back(BaseAddr);
16340 Builder.CreateCall(StNFunc, Ops);
16341 }
16342 return true;
16343}
16344
16345bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
16346 IntrinsicInst *DI, LoadInst *LI) const {
16347 // Only deinterleave2 supported at present.
16348 if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
16349 return false;
16350
16351 // Only a factor of 2 supported at present.
16352 const unsigned Factor = 2;
16353
16354 VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
16355 const DataLayout &DL = DI->getModule()->getDataLayout();
16356 bool UseScalable;
16357 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
16358 return false;
16359
16360 // TODO: Add support for using SVE instructions with fixed types later, using
16361 // the code from lowerInterleavedLoad to obtain the correct container type.
16362 if (UseScalable && !VTy->isScalableTy())
16363 return false;
16364
16365 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
16366
16367 VectorType *LdTy =
16368 VectorType::get(VTy->getElementType(),
16369 VTy->getElementCount().divideCoefficientBy(NumLoads));
16370
16371 Type *PtrTy = LI->getPointerOperandType();
16372 Function *LdNFunc = getStructuredLoadFunction(DI->getModule(), Factor,
16373 UseScalable, LdTy, PtrTy);
16374
16375 IRBuilder<> Builder(LI);
16376
16377 Value *Pred = nullptr;
16378 if (UseScalable)
16379 Pred =
16380 Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
16381
16382 Value *BaseAddr = LI->getPointerOperand();
16383 Value *Result;
16384 if (NumLoads > 1) {
16385 Value *Left = PoisonValue::get(VTy);
16386 Value *Right = PoisonValue::get(VTy);
16387
16388 for (unsigned I = 0; I < NumLoads; ++I) {
16389 Value *Offset = Builder.getInt64(I * Factor);
16390
16391 Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
16392 Value *LdN = nullptr;
16393 if (UseScalable)
16394 LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
16395 else
16396 LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
16397
16398 Value *Idx =
16399 Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
16400 Left = Builder.CreateInsertVector(
16401 VTy, Left, Builder.CreateExtractValue(LdN, 0), Idx);
16402 Right = Builder.CreateInsertVector(
16403 VTy, Right, Builder.CreateExtractValue(LdN, 1), Idx);
16404 }
16405
16406 Result = PoisonValue::get(DI->getType());
16407 Result = Builder.CreateInsertValue(Result, Left, 0);
16408 Result = Builder.CreateInsertValue(Result, Right, 1);
16409 } else {
16410 if (UseScalable)
16411 Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
16412 else
16413 Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
16414 }
16415
16416 DI->replaceAllUsesWith(Result);
16417 return true;
16418}
16419
16420bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
16421 IntrinsicInst *II, StoreInst *SI) const {
16422 // Only interleave2 supported at present.
16423 if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
16424 return false;
16425
16426 // Only a factor of 2 supported at present.
16427 const unsigned Factor = 2;
16428
16429 VectorType *VTy = cast<VectorType>(II->getOperand(0)->getType());
16430 const DataLayout &DL = II->getModule()->getDataLayout();
16431 bool UseScalable;
16432 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
16433 return false;
16434
16435 // TODO: Add support for using SVE instructions with fixed types later, using
16436 // the code from lowerInterleavedStore to obtain the correct container type.
16437 if (UseScalable && !VTy->isScalableTy())
16438 return false;
16439
16440 unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
16441
16442 VectorType *StTy =
16443 VectorType::get(VTy->getElementType(),
16444 VTy->getElementCount().divideCoefficientBy(NumStores));
16445
16446 Type *PtrTy = SI->getPointerOperandType();
16447 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
16448 UseScalable, StTy, PtrTy);
16449
16450 IRBuilder<> Builder(SI);
16451
16452 Value *BaseAddr = SI->getPointerOperand();
16453 Value *Pred = nullptr;
16454
16455 if (UseScalable)
16456 Pred =
16457 Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
16458
16459 Value *L = II->getOperand(0);
16460 Value *R = II->getOperand(1);
16461
16462 for (unsigned I = 0; I < NumStores; ++I) {
16463 Value *Address = BaseAddr;
16464 if (NumStores > 1) {
16465 Value *Offset = Builder.getInt64(I * Factor);
16466 Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
16467
16468 Value *Idx =
16469 Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
16470 L = Builder.CreateExtractVector(StTy, II->getOperand(0), Idx);
16471 R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
16472 }
16473
16474 if (UseScalable)
16475 Builder.CreateCall(StNFunc, {L, R, Pred, Address});
16476 else
16477 Builder.CreateCall(StNFunc, {L, R, Address});
16478 }
16479
16480 return true;
16481}
16482
16483EVT AArch64TargetLowering::getOptimalMemOpType(
16484 const MemOp &Op, const AttributeList &FuncAttributes) const {
16485 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
16486 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
16487 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
16488 // Only use AdvSIMD to implement memsets of 32 bytes and above. It would have
16489 // taken one instruction to materialize the v2i64 zero and one store (with
16490 // restrictive addressing mode). Just do i64 stores.
16491 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
16492 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
16493 if (Op.isAligned(AlignCheck))
16494 return true;
16495 unsigned Fast;
16496 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
16497 MachineMemOperand::MONone, &Fast) &&
16498 Fast;
16499 };
16500
16501 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
16502 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
16503 return MVT::v16i8;
16504 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
16505 return MVT::f128;
16506 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
16507 return MVT::i64;
16508 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
16509 return MVT::i32;
16510 return MVT::Other;
16511}
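// Illustrative outcomes of the selection above: a 64-byte, 16-byte-aligned
// memcpy is lowered with f128 (q-register) loads/stores, while a 16-byte
// memset falls through to plain i64 stores because it is below the 32-byte
// AdvSIMD threshold.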
16512
16513LLT AArch64TargetLowering::getOptimalMemOpLLT(
16514 const MemOp &Op, const AttributeList &FuncAttributes) const {
16515 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
16516 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
16517 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
16518 // Only use AdvSIMD to implement memsets of 32 bytes and above. It would have
16519 // taken one instruction to materialize the v2i64 zero and one store (with
16520 // restrictive addressing mode). Just do i64 stores.
16521 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
16522 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
16523 if (Op.isAligned(AlignCheck))
16524 return true;
16525 unsigned Fast;
16526 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
16527 MachineMemOperand::MONone, &Fast) &&
16528 Fast;
16529 };
16530
16531 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
16532 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
16533 return LLT::fixed_vector(2, 64);
16534 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
16535 return LLT::scalar(128);
16536 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
16537 return LLT::scalar(64);
16538 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
16539 return LLT::scalar(32);
16540 return LLT();
16541}
16542
16543// 12-bit optionally shifted immediates are legal for adds.
16544bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
16545 if (Immed == std::numeric_limits<int64_t>::min()) {
16546 LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
16547 << ": avoid UB for INT64_MIN\n");
16548 return false;
16549 }
16550 // Same encoding for add/sub, just flip the sign.
16551 Immed = std::abs(Immed);
16552 bool IsLegal = ((Immed >> 12) == 0 ||
16553 ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
16554 LLVM_DEBUG(dbgs() << "Is " << Immed
16555 << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
16556 return IsLegal;
16557}
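// Worked examples (illustrative):
//   4095    -> legal   (add x0, x1, #4095)
//   4096    -> legal   (add x0, x1, #1, lsl #12)
//   1048575 -> illegal (0xfffff: low 12 bits nonzero and value >= 2^12, so it
//                       would need more than one instruction)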
16558
16559bool AArch64TargetLowering::isLegalAddScalableImmediate(int64_t Imm) const {
16560 // We will only emit addvl/inc* instructions for SVE2
16561 if (!Subtarget->hasSVE2())
16562 return false;
16563
16564 // addvl's immediates are in terms of the number of bytes in a register.
16565 // Since there are 16 in the base supported size (128bits), we need to
16566 // divide the immediate by that much to give us a useful immediate to
16567 // multiply by vscale. We can't have a remainder as a result of this.
16568 if (Imm % 16 == 0)
16569 return isInt<6>(Imm / 16);
16570
16571 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
16572 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
16573 // of addvl as a result, so only take h|w|d into account.
16574 // Dec[h|w|d] will cover subtractions.
16575 // Immediates are in the range [1,16], so we can't do a 2's complement check.
16576 // FIXME: Can we make use of other patterns to cover other immediates?
16577
16578 // inch|dech
16579 if (Imm % 8 == 0)
16580 return std::labs(Imm / 8) <= 16;
16581 // incw|decw
16582 if (Imm % 4 == 0)
16583 return std::labs(Imm / 4) <= 16;
16584 // incd|decd
16585 if (Imm % 2 == 0)
16586 return std::labs(Imm / 2) <= 16;
16587
16588 return false;
16589}
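// Worked examples (illustrative, assuming SVE2 as required above):
//   Imm = 32 -> legal, 32/16 = 2 fits addvl's 6-bit signed immediate.
//   Imm = 24 -> legal via inch with multiplier 24/8 = 3.
//   Imm = 6  -> legal via incd with multiplier 6/2 = 3.
//   Imm = 5  -> illegal, not a multiple of 2.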
16590
16591// Return false to prevent folding
16592// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
16593// if the folding leads to worse code.
16594bool AArch64TargetLowering::isMulAddWithConstProfitable(
16595 SDValue AddNode, SDValue ConstNode) const {
16596 // Let the DAGCombiner decide for vector types and large types.
16597 const EVT VT = AddNode.getValueType();
16598 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
16599 return true;
16600
16601 // It is worse if c1 is legal add immediate, while c1*c2 is not
16602 // and has to be composed by at least two instructions.
16603 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
16604 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
16605 const int64_t C1 = C1Node->getSExtValue();
16606 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
16607 if (!isLegalAddImmediate(C1) || isLegalAddImmediate(C1C2.getSExtValue()))
16608 return true;
16609 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
16610 // Adapt to the width of a register.
16611 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
16612 AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn);
16613 if (Insn.size() > 1)
16614 return false;
16615
16616 // Default to true and let the DAGCombiner decide.
16617 return true;
16618}
16619
16620// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
16621// immediates is the same as for an add or a sub.
16622bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
16623 return isLegalAddImmediate(Immed);
16624}
16625
16626/// isLegalAddressingMode - Return true if the addressing mode represented
16627/// by AM is legal for this target, for a load/store of the specified type.
16628bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
16629 const AddrMode &AMode, Type *Ty,
16630 unsigned AS, Instruction *I) const {
16631 // AArch64 has five basic addressing modes:
16632 // reg
16633 // reg + 9-bit signed offset
16634 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
16635 // reg1 + reg2
16636 // reg + SIZE_IN_BYTES * reg
16637
16638 // No global is ever allowed as a base.
16639 if (AMode.BaseGV)
16640 return false;
16641
16642 // No reg+reg+imm addressing.
16643 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
16644 return false;
16645
16646 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
16647 // `2*ScaledReg` into `BaseReg + ScaledReg`
16648 AddrMode AM = AMode;
16649 if (AM.Scale && !AM.HasBaseReg) {
16650 if (AM.Scale == 1) {
16651 AM.HasBaseReg = true;
16652 AM.Scale = 0;
16653 } else if (AM.Scale == 2) {
16654 AM.HasBaseReg = true;
16655 AM.Scale = 1;
16656 } else {
16657 return false;
16658 }
16659 }
16660
16661 // A base register is required in all addressing modes.
16662 if (!AM.HasBaseReg)
16663 return false;
16664
16665 if (Ty->isScalableTy()) {
16666 if (isa<ScalableVectorType>(Ty)) {
16667 // See if we have a foldable vscale-based offset, for vector types which
16668 // are either legal or smaller than the minimum; more work will be
16669 // required if we need to consider addressing for types which need
16670 // legalization by splitting.
16671 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
16672 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
16673 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
16674 isPowerOf2_64(VecNumBytes))
16675 return isInt<4>(AM.ScalableOffset / (int64_t)VecNumBytes);
16676
16677 uint64_t VecElemNumBytes =
16678 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
16679 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
16680 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
16681 }
16682
16683 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
16684 }
16685
16686 // No scalable offsets allowed for non-scalable types.
16687 if (AM.ScalableOffset)
16688 return false;
16689
16690 // check reg + imm case:
16691 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
16692 uint64_t NumBytes = 0;
16693 if (Ty->isSized()) {
16694 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
16695 NumBytes = NumBits / 8;
16696 if (!isPowerOf2_64(NumBits))
16697 NumBytes = 0;
16698 }
16699
16700 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
16701 AM.Scale);
16702}
16703
16704 // Check whether the two offsets belong to the same imm24 range and share the
16705 // same high 12 bits; if so, the high part can be materialized with one add immediate.
16706int64_t
16708 int64_t MaxOffset) const {
16709 int64_t HighPart = MinOffset & ~0xfffULL;
16710 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
16711 // Rebase the value to an integer multiple of imm12.
16712 return HighPart;
16713 }
16714
16715 return 0;
16716}
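
// Worked example (ours): offsets 0x123400 and 0x123ff8 sit in the same imm24
// bucket and share the bits above the uimm12 field, so 0x123000 can be added
// to the base once and each access keeps only its low 12 bits as the scaled
// unsigned offset.
static_assert((0x123400 & ~0xfffULL) == 0x123000, "common high part");
static_assert((0x123400 >> 12) == (0x123ff8 >> 12), "same imm24 bucket");
static_assert((0x123000 & 0xfff) == 0 && (0x123000 >> 24) == 0,
              "high part is itself a legal add immediate (LSL #12 form)");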
16717
16719 // Consider splitting large offset of struct or array.
16720 return true;
16721}
16722
16724 const MachineFunction &MF, EVT VT) const {
16725 VT = VT.getScalarType();
16726
16727 if (!VT.isSimple())
16728 return false;
16729
16730 switch (VT.getSimpleVT().SimpleTy) {
16731 case MVT::f16:
16732 return Subtarget->hasFullFP16();
16733 case MVT::f32:
16734 case MVT::f64:
16735 return true;
16736 default:
16737 break;
16738 }
16739
16740 return false;
16741}
16742
16744 Type *Ty) const {
16745 switch (Ty->getScalarType()->getTypeID()) {
16746 case Type::FloatTyID:
16747 case Type::DoubleTyID:
16748 return true;
16749 default:
16750 return false;
16751 }
16752}
16753
16755 EVT VT, CodeGenOptLevel OptLevel) const {
16756 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
16758}
16759
16760const MCPhysReg *
16762 // LR is a callee-save register, but we must treat it as clobbered by any call
16763 // site. Hence we include LR in the scratch registers, which are in turn added
16764 // as implicit-defs for stackmaps and patchpoints.
16765 static const MCPhysReg ScratchRegs[] = {
16766 AArch64::X16, AArch64::X17, AArch64::LR, 0
16767 };
16768 return ScratchRegs;
16769}
16770
16772 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
16773 return RCRegs;
16774}
16775
16776bool
16778 CombineLevel Level) const {
16779 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
16780 N->getOpcode() == ISD::SRL) &&
16781 "Expected shift op");
16782
16783 SDValue ShiftLHS = N->getOperand(0);
16784 EVT VT = N->getValueType(0);
16785
16786 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
16787 // combine it with shift 'N' to let it be lowered to UBFX except:
16788 // ((x >> C) & mask) << C.
16789 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
16790 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
16791 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
16792 if (isMask_64(TruncMask)) {
16793 SDValue AndLHS = ShiftLHS.getOperand(0);
16794 if (AndLHS.getOpcode() == ISD::SRL) {
16795 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
16796 if (N->getOpcode() == ISD::SHL)
16797 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
16798 return SRLC->getZExtValue() == SHLC->getZExtValue();
16799 return false;
16800 }
16801 }
16802 }
16803 }
16804 return true;
16805}
16806
16808 const SDNode *N) const {
16809 assert(N->getOpcode() == ISD::XOR &&
16810 (N->getOperand(0).getOpcode() == ISD::SHL ||
16811 N->getOperand(0).getOpcode() == ISD::SRL) &&
16812 "Expected XOR(SHIFT) pattern");
16813
16814 // Only commute if the entire NOT mask is a hidden shifted mask.
16815 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
16816 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
16817 if (XorC && ShiftC) {
16818 unsigned MaskIdx, MaskLen;
16819 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
16820 unsigned ShiftAmt = ShiftC->getZExtValue();
16821 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
16822 if (N->getOperand(0).getOpcode() == ISD::SHL)
16823 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
16824 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
16825 }
16826 }
16827
16828 return false;
16829}
16830
16832 const SDNode *N, CombineLevel Level) const {
16833 assert(((N->getOpcode() == ISD::SHL &&
16834 N->getOperand(0).getOpcode() == ISD::SRL) ||
16835 (N->getOpcode() == ISD::SRL &&
16836 N->getOperand(0).getOpcode() == ISD::SHL)) &&
16837 "Expected shift-shift mask");
16838 // Don't allow multiuse shift folding with the same shift amount.
16839 if (!N->getOperand(0)->hasOneUse())
16840 return false;
16841
16842 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
16843 EVT VT = N->getValueType(0);
16844 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
16845 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
16846 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
16847 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
16848 }
16849
16850 return true;
16851}
16852
16854 unsigned BinOpcode, EVT VT) const {
16855 return VT.isScalableVector() && isTypeLegal(VT);
16856}
16857
16859 Type *Ty) const {
16860 assert(Ty->isIntegerTy());
16861
16862 unsigned BitSize = Ty->getPrimitiveSizeInBits();
16863 if (BitSize == 0)
16864 return false;
16865
16866 int64_t Val = Imm.getSExtValue();
16867 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
16868 return true;
16869
16870 if ((int64_t)Val < 0)
16871 Val = ~Val;
16872 if (BitSize == 32)
16873 Val &= (1LL << 32) - 1;
16874
16875 unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
16876 // MOVZ is free so return true when at most two MOVKs are needed.
16877 return Shift < 3;
16878}
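
// Worked example (ours): the Shift value above is the index of the highest
// 16-bit chunk containing a set bit, computed here with a hypothetical
// constexpr log2 (the real code bit-flips negative values for the MOVN path
// and masks 32-bit types first).
constexpr unsigned sketchLog2U64(uint64_t V) {
  unsigned L = 0;
  while (V >>= 1)
    ++L;
  return L;
}
static_assert(sketchLog2U64(0x123400005678ULL) / 16 == 2,
              "top set bit in chunk 2: MOVZ plus at most two MOVKs, cheap");
static_assert(sketchLog2U64(0xffff000000000000ULL) / 16 == 3,
              "bits in the top chunk fail the Shift < 3 test");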
16879
16881 unsigned Index) const {
16883 return false;
16884
16885 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
16886}
16887
16888/// Turn vector tests of the signbit in the form of:
16889/// xor (sra X, elt_size(X)-1), -1
16890/// into:
16891/// cmge X, X, #0
16893 const AArch64Subtarget *Subtarget) {
16894 EVT VT = N->getValueType(0);
16895 if (!Subtarget->hasNEON() || !VT.isVector())
16896 return SDValue();
16897
16898 // There must be a shift right algebraic before the xor, and the xor must be a
16899 // 'not' operation.
16900 SDValue Shift = N->getOperand(0);
16901 SDValue Ones = N->getOperand(1);
16902 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
16904 return SDValue();
16905
16906 // The shift should be smearing the sign bit across each vector element.
16907 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
16908 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
16909 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
16910 return SDValue();
16911
16912 return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
16913}
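
// Scalar model (ours) of the fold above, assuming the usual arithmetic right
// shift for signed values: xor(sra(x, W-1), -1) is all-ones exactly when the
// lane is non-negative, which is the per-lane result of CMGE x, x, #0.
constexpr int32_t sketchNotSignSmear(int32_t X) { return ~(X >> 31); }
static_assert(sketchNotSignSmear(42) == -1, "non-negative lane gives all ones");
static_assert(sketchNotSignSmear(-7) == 0, "negative lane gives all zeros");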
16914
16915// Given a vecreduce_add node, detect the below pattern and convert it to the
16916 // node sequence with UABDL, [S|U]ABD and UADDLP.
16917//
16918// i32 vecreduce_add(
16919// v16i32 abs(
16920// v16i32 sub(
16921// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
16922// =================>
16923// i32 vecreduce_add(
16924// v4i32 UADDLP(
16925// v8i16 add(
16926// v8i16 zext(
16927// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
16928// v8i16 zext(
16929// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
16931 SelectionDAG &DAG) {
16932 // Assumed i32 vecreduce_add
16933 if (N->getValueType(0) != MVT::i32)
16934 return SDValue();
16935
16936 SDValue VecReduceOp0 = N->getOperand(0);
16937 unsigned Opcode = VecReduceOp0.getOpcode();
16938 // Assumed v16i32 abs
16939 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
16940 return SDValue();
16941
16942 SDValue ABS = VecReduceOp0;
16943 // Assumed v16i32 sub
16944 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
16945 ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
16946 return SDValue();
16947
16948 SDValue SUB = ABS->getOperand(0);
16949 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
16950 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
16951 // Assumed v16i32 type
16952 if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
16953 SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
16954 return SDValue();
16955
16956 // Assumed zext or sext
16957 bool IsZExt = false;
16958 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
16959 IsZExt = true;
16960 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
16961 IsZExt = false;
16962 } else
16963 return SDValue();
16964
16965 SDValue EXT0 = SUB->getOperand(0);
16966 SDValue EXT1 = SUB->getOperand(1);
16967 // Assumed zext's operand has v16i8 type
16968 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
16969 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
16970 return SDValue();
16971
16972 // Pattern is detected. Let's convert it to a sequence of nodes.
16973 SDLoc DL(N);
16974
16975 // First, create the node pattern of UABD/SABD.
16976 SDValue UABDHigh8Op0 =
16977 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
16978 DAG.getConstant(8, DL, MVT::i64));
16979 SDValue UABDHigh8Op1 =
16980 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
16981 DAG.getConstant(8, DL, MVT::i64));
16982 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
16983 UABDHigh8Op0, UABDHigh8Op1);
16984 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
16985
16986 // Second, create the node pattern of UABAL.
16987 SDValue UABDLo8Op0 =
16988 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
16989 DAG.getConstant(0, DL, MVT::i64));
16990 SDValue UABDLo8Op1 =
16991 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
16992 DAG.getConstant(0, DL, MVT::i64));
16993 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
16994 UABDLo8Op0, UABDLo8Op1);
16995 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
16996 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
16997
16998 // Third, create the node of UADDLP.
16999 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
17000
17001 // Fourth, create the node of VECREDUCE_ADD.
17002 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
17003}
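
// Reference model (ours) of the rewrite above: the absolute-difference
// reduction over 16 bytes equals the sum of its low and high 8-lane halves,
// which is what the UABD/ZERO_EXTEND/UADDLP chain computes without overflow
// because every partial sum fits its widened lane type.
constexpr int sketchSumAbsDiff(const uint8_t *A, const uint8_t *B, int Lo,
                               int Hi) {
  int Sum = 0;
  for (int I = Lo; I < Hi; ++I)
    Sum += A[I] > B[I] ? A[I] - B[I] : B[I] - A[I];
  return Sum;
}
constexpr uint8_t SketchAbdA[16] = {1, 200, 3, 4, 50, 6, 7, 8,
                                    9, 10, 11, 255, 13, 14, 15, 16};
constexpr uint8_t SketchAbdB[16] = {16, 15, 14, 13, 12, 11, 10, 9,
                                    8, 7, 6, 5, 4, 3, 2, 1};
static_assert(sketchSumAbsDiff(SketchAbdA, SketchAbdB, 0, 16) ==
                  sketchSumAbsDiff(SketchAbdA, SketchAbdB, 0, 8) +
                      sketchSumAbsDiff(SketchAbdA, SketchAbdB, 8, 16),
              "low-half plus high-half reduction matches the full reduction");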
17004
17005// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
17006// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
17007// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
17008 // If we have vectors larger than v16i8 we extract v16i8 vectors, follow
17009 // the same steps above to get DOT instructions, concatenate them,
17010 // and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
17012 const AArch64Subtarget *ST) {
17013 if (!ST->hasDotProd())
17015
17016 SDValue Op0 = N->getOperand(0);
17017 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
17018 Op0.getValueType().getVectorElementType() != MVT::i32)
17019 return SDValue();
17020
17021 unsigned ExtOpcode = Op0.getOpcode();
17022 SDValue A = Op0;
17023 SDValue B;
17024 if (ExtOpcode == ISD::MUL) {
17025 A = Op0.getOperand(0);
17026 B = Op0.getOperand(1);
17027 if (A.getOpcode() != B.getOpcode() ||
17028 A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
17029 return SDValue();
17030 ExtOpcode = A.getOpcode();
17031 }
17032 if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
17033 return SDValue();
17034
17035 EVT Op0VT = A.getOperand(0).getValueType();
17036 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
17037 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
17038 if (!IsValidElementCount || !IsValidSize)
17039 return SDValue();
17040
17041 SDLoc DL(Op0);
17042 // For non-MLA reductions B can be set to 1. For MLA we take B to be the
17043 // operand of its extend.
17044 if (!B)
17045 B = DAG.getConstant(1, DL, Op0VT);
17046 else
17047 B = B.getOperand(0);
17048
17049 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
17050 unsigned NumOfVecReduce;
17051 EVT TargetType;
17052 if (IsMultipleOf16) {
17053 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
17054 TargetType = MVT::v4i32;
17055 } else {
17056 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
17057 TargetType = MVT::v2i32;
17058 }
17059 auto DotOpcode =
17061 // Handle the case where we need to generate only one Dot operation.
17062 if (NumOfVecReduce == 1) {
17063 SDValue Zeros = DAG.getConstant(0, DL, TargetType);
17064 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
17065 A.getOperand(0), B);
17066 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
17067 }
17068 // Generate Dot instructions that are multiple of 16.
17069 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
17070 SmallVector<SDValue, 4> SDotVec16;
17071 unsigned I = 0;
17072 for (; I < VecReduce16Num; I += 1) {
17073 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
17074 SDValue Op0 =
17075 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
17076 DAG.getConstant(I * 16, DL, MVT::i64));
17077 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
17078 DAG.getConstant(I * 16, DL, MVT::i64));
17079 SDValue Dot =
17080 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
17081 SDotVec16.push_back(Dot);
17082 }
17083 // Concatenate dot operations.
17084 EVT SDot16EVT =
17085 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
17086 SDValue ConcatSDot16 =
17087 DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
17088 SDValue VecReduceAdd16 =
17089 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
17090 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
17091 if (VecReduce8Num == 0)
17092 return VecReduceAdd16;
17093
17094 // Generate the remainder Dot operation that is multiple of 8.
17095 SmallVector<SDValue, 4> SDotVec8;
17096 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
17097 SDValue Vec8Op0 =
17098 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
17099 DAG.getConstant(I * 16, DL, MVT::i64));
17100 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
17101 DAG.getConstant(I * 16, DL, MVT::i64));
17102 SDValue Dot =
17103 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
17104 SDValue VecReduceAdd8 =
17105 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
17106 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
17107 VecReduceAdd8);
17108}
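
// Scalar model (ours) of the non-MLA case above: UDOT with B = splat(1)
// accumulates each group of four bytes into one i32 lane, and the final
// VECREDUCE_ADD over those lanes reproduces the original extended-byte sum.
constexpr int sketchUDotOnes(const uint8_t *A, int N /* multiple of 4 */) {
  int Total = 0;
  for (int Lane = 0; Lane < N / 4; ++Lane) {
    int Acc = 0; // one i32 lane of the DOT result
    for (int J = 0; J < 4; ++J)
      Acc += A[4 * Lane + J] * 1; // the B lane is the constant 1
    Total += Acc; // VECREDUCE_ADD over the i32 lanes
  }
  return Total;
}
constexpr uint8_t SketchDotIn[8] = {1, 2, 3, 4, 250, 6, 7, 8};
static_assert(sketchUDotOnes(SketchDotIn, 8) == 1 + 2 + 3 + 4 + 250 + 6 + 7 + 8,
              "dot with an all-ones operand equals the byte-sum reduction");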
17109
17110// Given an (integer) vecreduce, we know the order of the inputs does not
17111// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
17112// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
17113// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
17115 auto DetectAddExtract = [&](SDValue A) {
17116 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
17117 // UADDLP(x) if found.
17118 assert(A.getOpcode() == ISD::ADD);
17119 EVT VT = A.getValueType();
17120 SDValue Op0 = A.getOperand(0);
17121 SDValue Op1 = A.getOperand(1);
17122 if (Op0.getOpcode() != Op1.getOpcode() ||
17123 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
17124 Op0.getOpcode() != ISD::SIGN_EXTEND))
17125 return SDValue();
17126 SDValue Ext0 = Op0.getOperand(0);
17127 SDValue Ext1 = Op1.getOperand(0);
17128 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
17130 Ext0.getOperand(0) != Ext1.getOperand(0))
17131 return SDValue();
17132 // Check that the extract source type has twice as many elements as the add
17133 // type, and that the extracts are the upper/lower halves of the same source.
17135 VT.getVectorNumElements() * 2)
17136 return SDValue();
17137 if ((Ext0.getConstantOperandVal(1) != 0 ||
17139 (Ext1.getConstantOperandVal(1) != 0 ||
17141 return SDValue();
17142 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
17144 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
17145 };
17146
17147 if (SDValue R = DetectAddExtract(A))
17148 return R;
17149
17150 if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
17151 if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
17152 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
17153 A.getOperand(1));
17154 if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
17155 if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
17156 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
17157 A.getOperand(0));
17158 return SDValue();
17159}
17160
17161// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
17162// UADDLV(concat), where the concat represents the 64-bit zext sources.
17164 // Look for add(zext(64-bit source), zext(64-bit source)), returning
17165 // UADDLV(concat(zext, zext)) if found.
17166 assert(A.getOpcode() == ISD::ADD);
17167 EVT VT = A.getValueType();
17168 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
17169 return SDValue();
17170 SDValue Op0 = A.getOperand(0);
17171 SDValue Op1 = A.getOperand(1);
17172 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
17173 return SDValue();
17174 SDValue Ext0 = Op0.getOperand(0);
17175 SDValue Ext1 = Op1.getOperand(0);
17176 EVT ExtVT0 = Ext0.getValueType();
17177 EVT ExtVT1 = Ext1.getValueType();
17178 // Check zext VTs are the same and 64-bit length.
17179 if (ExtVT0 != ExtVT1 ||
17180 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
17181 return SDValue();
17182 // Get VT for concat of zext sources.
17183 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
17184 SDValue Concat =
17185 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
17186
17187 switch (VT.getSimpleVT().SimpleTy) {
17188 case MVT::v2i64:
17189 case MVT::v4i32:
17190 return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
17191 case MVT::v8i16: {
17192 SDValue Uaddlv =
17193 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
17194 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
17195 }
17196 default:
17197 llvm_unreachable("Unhandled vector type");
17198 }
17199}
17200
17202 SDValue A = N->getOperand(0);
17203 if (A.getOpcode() == ISD::ADD) {
17204 if (SDValue R = performUADDVAddCombine(A, DAG))
17205 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
17206 else if (SDValue R = performUADDVZextCombine(A, DAG))
17207 return R;
17208 }
17209 return SDValue();
17210}
17211
17214 const AArch64Subtarget *Subtarget) {
17215 if (DCI.isBeforeLegalizeOps())
17216 return SDValue();
17217
17218 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
17219}
17220
17221SDValue
17222AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
17223 SelectionDAG &DAG,
17224 SmallVectorImpl<SDNode *> &Created) const {
17226 if (isIntDivCheap(N->getValueType(0), Attr))
17227 return SDValue(N,0); // Lower SDIV as SDIV
17228
17229 EVT VT = N->getValueType(0);
17230
17231 // For scalable and fixed types, mark them as cheap so we can handle them much
17232 // later. This allows us to handle larger-than-legal types.
17233 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
17234 return SDValue(N, 0);
17235
17236 // fold (sdiv X, pow2)
17237 if ((VT != MVT::i32 && VT != MVT::i64) ||
17238 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
17239 return SDValue();
17240
17241 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
17242}
17243
17244SDValue
17245AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
17246 SelectionDAG &DAG,
17247 SmallVectorImpl<SDNode *> &Created) const {
17249 if (isIntDivCheap(N->getValueType(0), Attr))
17250 return SDValue(N, 0); // Lower SREM as SREM
17251
17252 EVT VT = N->getValueType(0);
17253
17254 // For scalable and fixed types, mark them as cheap so we can handle them much
17255 // later. This allows us to handle larger-than-legal types.
17256 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
17257 return SDValue(N, 0);
17258
17259 // fold (srem X, pow2)
17260 if ((VT != MVT::i32 && VT != MVT::i64) ||
17261 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
17262 return SDValue();
17263
17264 unsigned Lg2 = Divisor.countr_zero();
17265 if (Lg2 == 0)
17266 return SDValue();
17267
17268 SDLoc DL(N);
17269 SDValue N0 = N->getOperand(0);
17270 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
17271 SDValue Zero = DAG.getConstant(0, DL, VT);
17272 SDValue CCVal, CSNeg;
17273 if (Lg2 == 1) {
17274 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
17275 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
17276 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
17277
17278 Created.push_back(Cmp.getNode());
17279 Created.push_back(And.getNode());
17280 } else {
17281 SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
17282 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
17283
17284 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
17285 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
17286 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
17287 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
17288 Negs.getValue(1));
17289
17290 Created.push_back(Negs.getNode());
17291 Created.push_back(AndPos.getNode());
17292 Created.push_back(AndNeg.getNode());
17293 }
17294
17295 return CSNeg;
17296}
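
// Scalar model (ours) of the Lg2 >= 2 path above, ignoring the INT_MIN corner
// that the real flag-setting SUBS handles: CSNEG picks X & (2^k - 1) when X is
// positive and negates (-X) & (2^k - 1) otherwise, which matches C's
// truncating signed remainder for powers of two.
constexpr int32_t sketchSRemPow2(int32_t X, unsigned Lg2) {
  int32_t Mask = (1 << Lg2) - 1;
  int32_t Neg = 0 - X;                    // SUBS Zero, X
  bool MI = Neg < 0;                      // N flag of that subtraction
  return MI ? (X & Mask) : -(Neg & Mask); // CSNEG AndPos, AndNeg, MI
}
static_assert(sketchSRemPow2(23, 3) == 23 % 8, "positive dividend");
static_assert(sketchSRemPow2(-23, 3) == -23 % 8, "negative dividend");
static_assert(sketchSRemPow2(0, 3) == 0, "zero dividend");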
17297
17298static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
17299 switch(getIntrinsicID(S.getNode())) {
17300 default:
17301 break;
17302 case Intrinsic::aarch64_sve_cntb:
17303 return 8;
17304 case Intrinsic::aarch64_sve_cnth:
17305 return 16;
17306 case Intrinsic::aarch64_sve_cntw:
17307 return 32;
17308 case Intrinsic::aarch64_sve_cntd:
17309 return 64;
17310 }
17311 return {};
17312}
17313
17314/// Calculates what the pre-extend type is, based on the extension
17315/// operation node provided by \p Extend.
17316///
17317/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
17318/// pre-extend type is pulled directly from the operand, while other extend
17319/// operations need a bit more inspection to get this information.
17320///
17321/// \param Extend The SDNode from the DAG that represents the extend operation
17322///
17323/// \returns The type representing the \p Extend source type, or \p MVT::Other
17324/// if no valid type can be determined
17326 switch (Extend.getOpcode()) {
17327 case ISD::SIGN_EXTEND:
17328 case ISD::ZERO_EXTEND:
17329 return Extend.getOperand(0).getValueType();
17330 case ISD::AssertSext:
17331 case ISD::AssertZext:
17333 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
17334 if (!TypeNode)
17335 return MVT::Other;
17336 return TypeNode->getVT();
17337 }
17338 case ISD::AND: {
17340 dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
17341 if (!Constant)
17342 return MVT::Other;
17343
17344 uint32_t Mask = Constant->getZExtValue();
17345
17346 if (Mask == UCHAR_MAX)
17347 return MVT::i8;
17348 else if (Mask == USHRT_MAX)
17349 return MVT::i16;
17350 else if (Mask == UINT_MAX)
17351 return MVT::i32;
17352
17353 return MVT::Other;
17354 }
17355 default:
17356 return MVT::Other;
17357 }
17358}
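
// Small check (ours) of the AND case above: masking with 0xff or 0xffff
// behaves exactly like a zero-extend from i8 or i16, which is why the mask
// width determines the pre-extend type.
static_assert((0x1234 & 0xff) == static_cast<uint8_t>(0x1234),
              "AND 0xff acts as a zext from i8");
static_assert((0x12345678 & 0xffff) == static_cast<uint16_t>(0x12345678),
              "AND 0xffff acts as a zext from i16");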
17359
17360/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
17361/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
17362/// SExt/ZExt rather than the scalar SExt/ZExt
17364 EVT VT = BV.getValueType();
17365 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
17367 return SDValue();
17368
17369 // Use the first item in the buildvector/shuffle to get the size of the
17370 // extend, and make sure it looks valid.
17371 SDValue Extend = BV->getOperand(0);
17372 unsigned ExtendOpcode = Extend.getOpcode();
17373 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
17374 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
17375 ExtendOpcode == ISD::AssertSext;
17376 if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
17377 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
17378 return SDValue();
17379 // Shuffle inputs are vectors; limit to SIGN_EXTEND and ZERO_EXTEND to ensure
17380 // calculatePreExtendType will work without issue.
17381 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
17382 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
17383 return SDValue();
17384
17385 // Restrict valid pre-extend data type
17386 EVT PreExtendType = calculatePreExtendType(Extend);
17387 if (PreExtendType == MVT::Other ||
17388 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
17389 return SDValue();
17390
17391 // Make sure all other operands are equally extended
17392 for (SDValue Op : drop_begin(BV->ops())) {
17393 if (Op.isUndef())
17394 continue;
17395 unsigned Opc = Op.getOpcode();
17396 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
17397 Opc == ISD::AssertSext;
17398 if (OpcIsSExt != IsSExt || calculatePreExtendType(Op) != PreExtendType)
17399 return SDValue();
17400 }
17401
17402 SDValue NBV;
17403 SDLoc DL(BV);
17404 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
17405 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
17406 EVT PreExtendLegalType =
17407 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
17409 for (SDValue Op : BV->ops())
17410 NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
17411 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
17412 PreExtendLegalType));
17413 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
17414 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
17415 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
17416 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
17417 BV.getOperand(1).isUndef()
17418 ? DAG.getUNDEF(PreExtendVT)
17419 : BV.getOperand(1).getOperand(0),
17420 cast<ShuffleVectorSDNode>(BV)->getMask());
17421 }
17422 return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, NBV);
17423}
17424
17425/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
17426/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
17428 // If the value type isn't a vector, none of the operands are going to be dups
17429 EVT VT = Mul->getValueType(0);
17430 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
17431 return SDValue();
17432
17433 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
17434 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
17435
17436 // Neither operand has been changed; don't make any further changes.
17437 if (!Op0 && !Op1)
17438 return SDValue();
17439
17440 SDLoc DL(Mul);
17441 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
17442 Op1 ? Op1 : Mul->getOperand(1));
17443}
17444
17445// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
17446// Same for other types with equivalent constants.
17448 EVT VT = N->getValueType(0);
17449 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
17450 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
17451 return SDValue();
17452 if (N->getOperand(0).getOpcode() != ISD::AND ||
17453 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
17454 return SDValue();
17455
17456 SDValue And = N->getOperand(0);
17457 SDValue Srl = And.getOperand(0);
17458
17459 APInt V1, V2, V3;
17460 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
17461 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
17463 return SDValue();
17464
17465 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
17466 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
17467 V3 != (HalfSize - 1))
17468 return SDValue();
17469
17470 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
17471 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
17472 VT.getVectorElementCount() * 2);
17473
17474 SDLoc DL(N);
17475 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
17476 SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In);
17477 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
17478}
17479
17482 const AArch64Subtarget *Subtarget) {
17483
17484 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
17485 return Ext;
17487 return Ext;
17488
17489 if (DCI.isBeforeLegalizeOps())
17490 return SDValue();
17491
17492 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
17493 // and in MachineCombiner pass, add+mul will be combined into madd.
17494 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
17495 SDLoc DL(N);
17496 EVT VT = N->getValueType(0);
17497 SDValue N0 = N->getOperand(0);
17498 SDValue N1 = N->getOperand(1);
17499 SDValue MulOper;
17500 unsigned AddSubOpc;
17501
17502 auto IsAddSubWith1 = [&](SDValue V) -> bool {
17503 AddSubOpc = V->getOpcode();
17504 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
17505 SDValue Opnd = V->getOperand(1);
17506 MulOper = V->getOperand(0);
17507 if (AddSubOpc == ISD::SUB)
17508 std::swap(Opnd, MulOper);
17509 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
17510 return C->isOne();
17511 }
17512 return false;
17513 };
17514
17515 if (IsAddSubWith1(N0)) {
17516 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
17517 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
17518 }
17519
17520 if (IsAddSubWith1(N1)) {
17521 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
17522 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
17523 }
17524
17525 // The below optimizations require a constant RHS.
17526 if (!isa<ConstantSDNode>(N1))
17527 return SDValue();
17528
17529 ConstantSDNode *C = cast<ConstantSDNode>(N1);
17530 const APInt &ConstValue = C->getAPIntValue();
17531
17532 // Allow the scaling to be folded into the `cnt` instruction by preventing
17533 // the scaling from being obscured here. This makes it easier to pattern match.
17534 if (IsSVECntIntrinsic(N0) ||
17535 (N0->getOpcode() == ISD::TRUNCATE &&
17536 (IsSVECntIntrinsic(N0->getOperand(0)))))
17537 if (ConstValue.sge(1) && ConstValue.sle(16))
17538 return SDValue();
17539
17540 // Multiplication of a power of two plus/minus one can be done more
17541 // cheaply as shift+add/sub. For now, this is true unilaterally. If
17542 // future CPUs have a cheaper MADD instruction, this may need to be
17543 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
17544 // 64-bit is 5 cycles, so this is always a win.
17545 // More aggressively, some multiplications N0 * C can be lowered to
17546 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
17547 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
17548 // TODO: lower more cases.
17549
17550 // TrailingZeroes is used to test if the mul can be lowered to
17551 // shift+add+shift.
17552 unsigned TrailingZeroes = ConstValue.countr_zero();
17553 if (TrailingZeroes) {
17554 // Conservatively do not lower to shift+add+shift if the mul might be
17555 // folded into smul or umul.
17556 if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
17557 isZeroExtended(N0, DAG)))
17558 return SDValue();
17559 // Conservatively do not lower to shift+add+shift if the mul might be
17560 // folded into madd or msub.
17561 if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
17562 N->use_begin()->getOpcode() == ISD::SUB))
17563 return SDValue();
17564 }
17565 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
17566 // and shift+add+shift.
17567 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
17568 unsigned ShiftAmt;
17569
17570 auto Shl = [&](SDValue N0, unsigned N1) {
17571 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
17572 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
17573 };
17574 auto Add = [&](SDValue N0, SDValue N1) {
17575 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
17576 };
17577 auto Sub = [&](SDValue N0, SDValue N1) {
17578 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
17579 };
17580 auto Negate = [&](SDValue N) {
17581 SDValue Zero = DAG.getConstant(0, DL, VT);
17582 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
17583 };
17584
17585 // Can the constant C be decomposed into (1+2^M1)*(1+2^N1)? E.g.,
17586 // C = 45 is equal to (1+4)*(1+8); we don't decompose it into (1+2)*(16-1) as
17587 // the (2^N - 1) can't be executed via a single instruction.
17588 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
17589 unsigned BitWidth = C.getBitWidth();
17590 for (unsigned i = 1; i < BitWidth / 2; i++) {
17591 APInt Rem;
17592 APInt X(BitWidth, (1 << i) + 1);
17593 APInt::sdivrem(C, X, N, Rem);
17594 APInt NVMinus1 = N - 1;
17595 if (Rem == 0 && NVMinus1.isPowerOf2()) {
17596 M = X;
17597 return true;
17598 }
17599 }
17600 return false;
17601 };
17602
17603 if (ConstValue.isNonNegative()) {
17604 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
17605 // (mul x, 2^N - 1) => (sub (shl x, N), x)
17606 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
17607 // (mul x, (2^M + 1) * (2^N + 1))
17608 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
17609 APInt SCVMinus1 = ShiftedConstValue - 1;
17610 APInt SCVPlus1 = ShiftedConstValue + 1;
17611 APInt CVPlus1 = ConstValue + 1;
17612 APInt CVM, CVN;
17613 if (SCVMinus1.isPowerOf2()) {
17614 ShiftAmt = SCVMinus1.logBase2();
17615 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
17616 } else if (CVPlus1.isPowerOf2()) {
17617 ShiftAmt = CVPlus1.logBase2();
17618 return Sub(Shl(N0, ShiftAmt), N0);
17619 } else if (SCVPlus1.isPowerOf2()) {
17620 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
17621 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
17622 } else if (Subtarget->hasALULSLFast() &&
17623 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
17624 APInt CVMMinus1 = CVM - 1;
17625 APInt CVNMinus1 = CVN - 1;
17626 unsigned ShiftM1 = CVMMinus1.logBase2();
17627 unsigned ShiftN1 = CVNMinus1.logBase2();
17628 // LSLFast implies that shifts of up to 3 places are fast
17629 if (ShiftM1 <= 3 && ShiftN1 <= 3) {
17630 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
17631 return Add(Shl(MVal, ShiftN1), MVal);
17632 }
17633 }
17634 } else {
17635 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
17636 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
17637 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
17638 APInt SCVPlus1 = -ShiftedConstValue + 1;
17639 APInt CVNegPlus1 = -ConstValue + 1;
17640 APInt CVNegMinus1 = -ConstValue - 1;
17641 if (CVNegPlus1.isPowerOf2()) {
17642 ShiftAmt = CVNegPlus1.logBase2();
17643 return Sub(N0, Shl(N0, ShiftAmt));
17644 } else if (CVNegMinus1.isPowerOf2()) {
17645 ShiftAmt = CVNegMinus1.logBase2();
17646 return Negate(Add(Shl(N0, ShiftAmt), N0));
17647 } else if (SCVPlus1.isPowerOf2()) {
17648 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
17649 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
17650 }
17651 }
17652
17653 return SDValue();
17654}
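
// Worked example (ours) of the isPowPlusPlusConst path: 45 = (1 + 4) * (1 + 8),
// so x * 45 becomes MV = (x << 2) + x followed by (MV << 3) + MV, two
// shift-and-add ALU operations instead of a constant move plus MADD.
constexpr int64_t sketchMul45(int64_t X) {
  int64_t MV = (X << 2) + X; // x * 5
  return (MV << 3) + MV;     // (x * 5) * 9 == x * 45
}
static_assert(sketchMul45(7) == 7 * 45, "decomposition matches the multiply");
static_assert(sketchMul45(100) == 100 * 45, "and for a larger value");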
17655
17657 SelectionDAG &DAG) {
17658 // Take advantage of vector comparisons producing 0 or -1 in each lane to
17659 // optimize away operation when it's from a constant.
17660 //
17661 // The general transformation is:
17662 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
17663 // AND(VECTOR_CMP(x,y), constant2)
17664 // constant2 = UNARYOP(constant)
17665
17666 // Early exit if this isn't a vector operation, the operand of the
17667 // unary operation isn't a bitwise AND, or if the sizes of the operations
17668 // aren't the same.
17669 EVT VT = N->getValueType(0);
17670 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
17671 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
17672 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
17673 return SDValue();
17674
17675 // Now check that the other operand of the AND is a constant. We could
17676 // make the transformation for non-constant splats as well, but it's unclear
17677 // that would be a benefit as it would not eliminate any operations, just
17678 // perform one more step in scalar code before moving to the vector unit.
17679 if (BuildVectorSDNode *BV =
17680 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
17681 // Bail out if the vector isn't a constant.
17682 if (!BV->isConstant())
17683 return SDValue();
17684
17685 // Everything checks out. Build up the new and improved node.
17686 SDLoc DL(N);
17687 EVT IntVT = BV->getValueType(0);
17688 // Create a new constant of the appropriate type for the transformed
17689 // DAG.
17690 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
17691 // The AND node needs bitcasts to/from an integer vector type around it.
17692 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
17693 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
17694 N->getOperand(0)->getOperand(0), MaskConst);
17695 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
17696 return Res;
17697 }
17698
17699 return SDValue();
17700}
17701
17703 const AArch64Subtarget *Subtarget) {
17704 // First try to optimize away the conversion when it's conditionally from
17705 // a constant. Vectors only.
17707 return Res;
17708
17709 EVT VT = N->getValueType(0);
17710 if (VT != MVT::f32 && VT != MVT::f64)
17711 return SDValue();
17712
17713 // Only optimize when the source and destination types have the same width.
17714 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
17715 return SDValue();
17716
17717 // If the result of an integer load is only used by an integer-to-float
17718 // conversion, use an FP load and an AdvSIMD scalar {S|U}CVTF instead.
17719 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
17720 SDValue N0 = N->getOperand(0);
17721 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
17722 N0.hasOneUse() &&
17723 // Do not change the width of a volatile load.
17724 !cast<LoadSDNode>(N0)->isVolatile()) {
17725 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
17726 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
17727 LN0->getPointerInfo(), LN0->getAlign(),
17728 LN0->getMemOperand()->getFlags());
17729
17730 // Make sure successors of the original load stay after it by updating them
17731 // to use the new Chain.
17732 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
17733
17734 unsigned Opcode =
17736 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
17737 }
17738
17739 return SDValue();
17740}
17741
17742/// Fold a floating-point multiply by power of two into floating-point to
17743/// fixed-point conversion.
17746 const AArch64Subtarget *Subtarget) {
17747 if (!Subtarget->isNeonAvailable())
17748 return SDValue();
17749
17750 if (!N->getValueType(0).isSimple())
17751 return SDValue();
17752
17753 SDValue Op = N->getOperand(0);
17754 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
17755 return SDValue();
17756
17757 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
17758 return SDValue();
17759
17760 SDValue ConstVec = Op->getOperand(1);
17761 if (!isa<BuildVectorSDNode>(ConstVec))
17762 return SDValue();
17763
17764 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
17765 uint32_t FloatBits = FloatTy.getSizeInBits();
17766 if (FloatBits != 32 && FloatBits != 64 &&
17767 (FloatBits != 16 || !Subtarget->hasFullFP16()))
17768 return SDValue();
17769
17770 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
17771 uint32_t IntBits = IntTy.getSizeInBits();
17772 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
17773 return SDValue();
17774
17775 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
17776 if (IntBits > FloatBits)
17777 return SDValue();
17778
17779 BitVector UndefElements;
17780 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
17781 int32_t Bits = IntBits == 64 ? 64 : 32;
17782 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
17783 if (C == -1 || C == 0 || C > Bits)
17784 return SDValue();
17785
17786 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
17787 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
17788 return SDValue();
17789
17790 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
17791 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
17792 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
17793 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
17794 return SDValue();
17795 }
17796
17797 SDLoc DL(N);
17798 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
17799 N->getOpcode() == ISD::FP_TO_SINT_SAT);
17800 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
17801 : Intrinsic::aarch64_neon_vcvtfp2fxu;
17802 SDValue FixConv =
17804 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
17805 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
17806 // We can handle smaller integers by generating an extra trunc.
17807 if (IntBits < FloatBits)
17808 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
17809
17810 return FixConv;
17811}
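
// Scalar model (ours): multiplying by 2^C before a float-to-int conversion is
// a conversion to a fixed-point value with C fractional bits, which is what a
// single FCVTZS/FCVTZU with the #fbits operand computes.
constexpr int32_t sketchToFixed(float X, unsigned FracBits) {
  return static_cast<int32_t>(X * static_cast<float>(1u << FracBits));
}
static_assert(sketchToFixed(1.75f, 4) == 28, "1.75 with 4 fractional bits");
static_assert(sketchToFixed(-2.5f, 2) == -10, "-2.5 with 2 fractional bits");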
17812
17813/// Fold a floating-point divide by power of two into fixed-point to
17814/// floating-point conversion.
17817 const AArch64Subtarget *Subtarget) {
17818 if (!Subtarget->hasNEON())
17819 return SDValue();
17820
17821 SDValue Op = N->getOperand(0);
17822 unsigned Opc = Op->getOpcode();
17823 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
17824 !Op.getOperand(0).getValueType().isSimple() ||
17825 (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
17826 return SDValue();
17827
17828 SDValue ConstVec = N->getOperand(1);
17829 if (!isa<BuildVectorSDNode>(ConstVec))
17830 return SDValue();
17831
17832 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
17833 int32_t IntBits = IntTy.getSizeInBits();
17834 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
17835 return SDValue();
17836
17837 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
17838 int32_t FloatBits = FloatTy.getSizeInBits();
17839 if (FloatBits != 32 && FloatBits != 64)
17840 return SDValue();
17841
17842 // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
17843 if (IntBits > FloatBits)
17844 return SDValue();
17845
17846 BitVector UndefElements;
17847 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
17848 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
17849 if (C == -1 || C == 0 || C > FloatBits)
17850 return SDValue();
17851
17852 MVT ResTy;
17853 unsigned NumLanes = Op.getValueType().getVectorNumElements();
17854 switch (NumLanes) {
17855 default:
17856 return SDValue();
17857 case 2:
17858 ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
17859 break;
17860 case 4:
17861 ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
17862 break;
17863 }
17864
17865 if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
17866 return SDValue();
17867
17868 SDLoc DL(N);
17869 SDValue ConvInput = Op.getOperand(0);
17870 bool IsSigned = Opc == ISD::SINT_TO_FP;
17871 if (IntBits < FloatBits)
17872 ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
17873 ResTy, ConvInput);
17874
17875 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
17876 : Intrinsic::aarch64_neon_vcvtfxu2fp;
17877 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
17878 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
17879 DAG.getConstant(C, DL, MVT::i32));
17880}
17881
17883 const AArch64TargetLowering &TLI) {
17884 EVT VT = N->getValueType(0);
17885 SelectionDAG &DAG = DCI.DAG;
17886 SDLoc DL(N);
17887 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
17888
17889 if (!VT.isVector())
17890 return SDValue();
17891
17892 // The combining code works for NEON, SVE2 and SME.
17893 if (TLI.useSVEForFixedLengthVectorVT(VT, !Subtarget.isNeonAvailable()) ||
17894 (VT.isScalableVector() && !Subtarget.hasSVE2()))
17895 return SDValue();
17896
17897 SDValue N0 = N->getOperand(0);
17898 if (N0.getOpcode() != ISD::AND)
17899 return SDValue();
17900
17901 SDValue N1 = N->getOperand(1);
17902 if (N1.getOpcode() != ISD::AND)
17903 return SDValue();
17904
17905 // InstCombine does (not (neg a)) => (add a -1).
17906 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
17907 // Loop over all combinations of AND operands.
17908 for (int i = 1; i >= 0; --i) {
17909 for (int j = 1; j >= 0; --j) {
17910 SDValue O0 = N0->getOperand(i);
17911 SDValue O1 = N1->getOperand(j);
17912 SDValue Sub, Add, SubSibling, AddSibling;
17913
17914 // Find a SUB and an ADD operand, one from each AND.
17915 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
17916 Sub = O0;
17917 Add = O1;
17918 SubSibling = N0->getOperand(1 - i);
17919 AddSibling = N1->getOperand(1 - j);
17920 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
17921 Add = O0;
17922 Sub = O1;
17923 AddSibling = N0->getOperand(1 - i);
17924 SubSibling = N1->getOperand(1 - j);
17925 } else
17926 continue;
17927
17929 continue;
17930
17931 // The all-ones constant is always the right-hand operand of the Add.
17932 if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
17933 continue;
17934
17935 if (Sub.getOperand(1) != Add.getOperand(0))
17936 continue;
17937
17938 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
17939 }
17940 }
17941
17942 // (or (and a b) (and (not a) c)) => (bsl a b c)
17943 // We only have to look for constant vectors here since the general, variable
17944 // case can be handled in TableGen.
17945 unsigned Bits = VT.getScalarSizeInBits();
17946 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
17947 for (int i = 1; i >= 0; --i)
17948 for (int j = 1; j >= 0; --j) {
17949 APInt Val1, Val2;
17950
17951 if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
17953 (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) {
17954 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
17955 N0->getOperand(1 - i), N1->getOperand(1 - j));
17956 }
17957 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
17958 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
17959 if (!BVN0 || !BVN1)
17960 continue;
17961
17962 bool FoundMatch = true;
17963 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
17964 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
17965 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
17966 if (!CN0 || !CN1 ||
17967 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
17968 FoundMatch = false;
17969 break;
17970 }
17971 }
17972 if (FoundMatch)
17973 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
17974 N0->getOperand(1 - i), N1->getOperand(1 - j));
17975 }
17976
17977 return SDValue();
17978}
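
// Bitwise check (ours) of the selection identity behind the BSL/BSP fold:
// (a & b) | (~a & c) picks, per bit, b where a is one and c where a is zero,
// which is exactly what the bitwise-select instruction computes with a as the
// mask operand.
constexpr uint32_t sketchBsl(uint32_t A, uint32_t B, uint32_t C) {
  return (A & B) | (~A & C);
}
static_assert(sketchBsl(0xff00ff00u, 0x12345678u, 0x9abcdef0u) == 0x12bc56f0u,
              "mask selects B bytes where set and C bytes where clear");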
17979
17980// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
17981// convert to csel(ccmp(.., cc0)), depending on cc1:
17982
17983// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
17984// =>
17985// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
17986//
17987// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
17988// =>
17989// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
17991 EVT VT = N->getValueType(0);
17992 SDValue CSel0 = N->getOperand(0);
17993 SDValue CSel1 = N->getOperand(1);
17994
17995 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
17996 CSel1.getOpcode() != AArch64ISD::CSEL)
17997 return SDValue();
17998
17999 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
18000 return SDValue();
18001
18002 if (!isNullConstant(CSel0.getOperand(0)) ||
18003 !isOneConstant(CSel0.getOperand(1)) ||
18004 !isNullConstant(CSel1.getOperand(0)) ||
18005 !isOneConstant(CSel1.getOperand(1)))
18006 return SDValue();
18007
18008 SDValue Cmp0 = CSel0.getOperand(3);
18009 SDValue Cmp1 = CSel1.getOperand(3);
18012 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
18013 return SDValue();
18014 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
18015 Cmp0.getOpcode() == AArch64ISD::SUBS) {
18016 std::swap(Cmp0, Cmp1);
18017 std::swap(CC0, CC1);
18018 }
18019
18020 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
18021 return SDValue();
18022
18023 SDLoc DL(N);
18024 SDValue CCmp, Condition;
18025 unsigned NZCV;
18026
18027 if (N->getOpcode() == ISD::AND) {
18029 Condition = DAG.getConstant(InvCC0, DL, MVT_CC);
18031 } else {
18033 Condition = DAG.getConstant(CC0, DL, MVT_CC);
18035 }
18036
18037 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
18038
18039 auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
18040 if (Op1 && Op1->getAPIntValue().isNegative() &&
18041 Op1->getAPIntValue().sgt(-32)) {
18042 // CCMP accepts a constant in the range [0, 31].
18043 // If Op1 is a constant in the range [-31, -1], we
18044 // can select CCMN instead to avoid the extra mov.
18045 SDValue AbsOp1 =
18046 DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
18047 CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1,
18048 NZCVOp, Condition, Cmp0);
18049 } else {
18050 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
18051 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
18052 }
18053 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
18054 CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
18055 CCmp);
18056}
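
// Scalar model (ours) of the AND case, with cc0 assumed to be EQ and cc1
// assumed to be LT purely for illustration: CCMP performs the second compare
// only when cc0 held on the first one, and otherwise installs an NZCV value
// for which cc1 is false, so the final CSET(cc1) reproduces (cc0 && cc1).
constexpr bool sketchAndViaCCmp(int X0, int Y0, int X1, int Y1) {
  bool CC0 = X0 == Y0;                // cc0 on CMP x0, y0
  bool CC1 = CC0 ? (X1 < Y1) : false; // CCMP x1, y1, #nzcv, cc0
  return CC1;                         // CSET cc1
}
static_assert(sketchAndViaCCmp(3, 3, 1, 2) == ((3 == 3) && (1 < 2)),
              "both conditions hold");
static_assert(sketchAndViaCCmp(3, 4, 1, 2) == ((3 == 4) && (1 < 2)),
              "first condition fails, NZCV makes cc1 false");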
18057
18059 const AArch64Subtarget *Subtarget,
18060 const AArch64TargetLowering &TLI) {
18061 SelectionDAG &DAG = DCI.DAG;
18062 EVT VT = N->getValueType(0);
18063
18064 if (SDValue R = performANDORCSELCombine(N, DAG))
18065 return R;
18066
18067 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
18068 return SDValue();
18069
18070 if (SDValue Res = tryCombineToBSL(N, DCI, TLI))
18071 return Res;
18072
18073 return SDValue();
18074}
18075
18077 if (!MemVT.getVectorElementType().isSimple())
18078 return false;
18079
18080 uint64_t MaskForTy = 0ull;
18081 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
18082 case MVT::i8:
18083 MaskForTy = 0xffull;
18084 break;
18085 case MVT::i16:
18086 MaskForTy = 0xffffull;
18087 break;
18088 case MVT::i32:
18089 MaskForTy = 0xffffffffull;
18090 break;
18091 default:
18092 return false;
18094 }
18095
18096 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
18097 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
18098 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
18099
18100 return false;
18101}
18102
18104 SDValue LeafOp = SDValue(N, 0);
18105 SDValue Op = N->getOperand(0);
18106 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
18107 LeafOp.getValueType() != Op.getValueType())
18108 Op = Op->getOperand(0);
18109 if (LeafOp.getValueType() == Op.getValueType())
18110 return Op;
18111 return SDValue();
18112}
18113
18116 SelectionDAG &DAG = DCI.DAG;
18117 SDValue Src = N->getOperand(0);
18118 unsigned Opc = Src->getOpcode();
18119
18120 // Zero/any extend of an unsigned unpack
18121 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
18122 SDValue UnpkOp = Src->getOperand(0);
18123 SDValue Dup = N->getOperand(1);
18124
18125 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
18126 return SDValue();
18127
18128 SDLoc DL(N);
18129 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
18130 if (!C)
18131 return SDValue();
18132
18133 uint64_t ExtVal = C->getZExtValue();
18134
18135 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
18136 return ((ExtVal == 0xFF && VT == MVT::i8) ||
18137 (ExtVal == 0xFFFF && VT == MVT::i16) ||
18138 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
18139 };
18140
18141 // If the mask is fully covered by the unpack, we don't need to push
18142 // a new AND onto the operand
18143 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
18144 if (MaskAndTypeMatch(EltTy))
18145 return Src;
18146
18147 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
18148 // to see if the mask is all-ones of size MemTy.
18149 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(UnpkOp);
18150 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
18151 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
18152 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
18153 if (MaskAndTypeMatch(EltTy))
18154 return Src;
18155 }
18156
18157 // Truncate to prevent a DUP with an over-wide constant.
18158 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
18159
18160 // Otherwise, make sure we propagate the AND to the operand
18161 // of the unpack
18162 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
18163 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
18164
18165 SDValue And = DAG.getNode(ISD::AND, DL,
18166 UnpkOp->getValueType(0), UnpkOp, Dup);
18167
18168 return DAG.getNode(Opc, DL, N->getValueType(0), And);
18169 }
18170
18171 if (DCI.isBeforeLegalizeOps())
18172 return SDValue();
18173
18174 // If either operand of the AND is an all-active predicate, the AND is a
18175 // no-op and we can simply return the other operand.
18176 if (isAllActivePredicate(DAG, N->getOperand(0)))
18177 return N->getOperand(1);
18178 if (isAllActivePredicate(DAG, N->getOperand(1)))
18179 return N->getOperand(0);
18180
18182 return SDValue();
18183
18184 SDValue Mask = N->getOperand(1);
18185
18186 if (!Src.hasOneUse())
18187 return SDValue();
18188
18189 EVT MemVT;
18190
18191 // SVE load instructions perform an implicit zero-extend, which makes them
18192 // perfect candidates for combining.
18193 switch (Opc) {
18197 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
18198 break;
18214 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
18215 break;
18216 default:
18217 return SDValue();
18218 }
18219
18220 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
18221 return Src;
18222
18223 return SDValue();
18224}
18225
18226// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
18229
18230 // This function performs an optimization on a specific pattern involving
18231 // an AND operation and a SETCC (Set Condition Code) node.
18232
18233 SDValue SetCC = N->getOperand(0);
18234 EVT VT = N->getValueType(0);
18235 SelectionDAG &DAG = DCI.DAG;
18236
18237 // If the current node (N) is used by any SELECT instruction, return an empty
18238 // SDValue and skip the optimization, since applying it there would produce
18239 // incorrect results.
18240 for (auto U : N->uses())
18241 if (U->getOpcode() == ISD::SELECT)
18242 return SDValue();
18243
18244 // Check if the operand is a SETCC node with floating-point comparison
18245 if (SetCC.getOpcode() == ISD::SETCC &&
18246 SetCC.getOperand(0).getValueType() == MVT::f32) {
18247
18248 SDValue Cmp;
18250
18251 // Check if the DAG is after legalization and if we can emit the conjunction
18252 if (!DCI.isBeforeLegalize() &&
18253 (Cmp = emitConjunction(DAG, SDValue(N, 0), CC))) {
18254
18255      AArch64CC::CondCode InvertedCC = AArch64CC::getInvertedCondCode(CC);
18256
18257 SDLoc DL(N);
18258 return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
18259 DAG.getConstant(0, DL, VT),
18260 DAG.getConstant(InvertedCC, DL, MVT::i32), Cmp);
18261 }
18262 }
18263 return SDValue();
18264}
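// Illustrative sketch (assumed example): for C code along the lines of
//   int both(float a, float b, float c, float d) { return a < b && c > d; }
// the AND of the two f32 SETCC nodes is expected to be emitted as a single
// conjunction, i.e. an FCMP followed by an FCCMP feeding one CSINC, instead of
// two CSET results being ANDed together.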
18265
18266static SDValue performANDCombine(SDNode *N,
18267                                 TargetLowering::DAGCombinerInfo &DCI) {
18268  SelectionDAG &DAG = DCI.DAG;
18269 SDValue LHS = N->getOperand(0);
18270 SDValue RHS = N->getOperand(1);
18271 EVT VT = N->getValueType(0);
18272
18273 if (SDValue R = performANDORCSELCombine(N, DAG))
18274 return R;
18275
18276 if (SDValue R = performANDSETCCCombine(N,DCI))
18277 return R;
18278
18279 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
18280 return SDValue();
18281
18282 if (VT.isScalableVector())
18283 return performSVEAndCombine(N, DCI);
18284
18285 // The combining code below works only for NEON vectors. In particular, it
18286 // does not work for SVE when dealing with vectors wider than 128 bits.
18287 if (!VT.is64BitVector() && !VT.is128BitVector())
18288 return SDValue();
18289
18290 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
18291 if (!BVN)
18292 return SDValue();
18293
18294 // AND does not accept an immediate, so check if we can use a BIC immediate
18295 // instruction instead. We do this here instead of using a (and x, (mvni imm))
18296 // pattern in isel, because some immediates may be lowered to the preferred
18297 // (and x, (movi imm)) form, even though an mvni representation also exists.
18298 APInt DefBits(VT.getSizeInBits(), 0);
18299 APInt UndefBits(VT.getSizeInBits(), 0);
18300 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
18301 SDValue NewOp;
18302
18303 // Any bits known to already be 0 need not be cleared again, which can help
18304 // reduce the size of the immediate to one supported by the instruction.
18305 KnownBits Known = DAG.computeKnownBits(LHS);
18306 APInt ZeroSplat(VT.getSizeInBits(), 0);
18307 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
18308 ZeroSplat |= Known.Zero.zext(VT.getSizeInBits())
18309 << (Known.Zero.getBitWidth() * I);
18310
18311 DefBits = ~(DefBits | ZeroSplat);
18312 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
18313 DefBits, &LHS)) ||
18314 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
18315 DefBits, &LHS)))
18316 return NewOp;
18317
18318 UndefBits = ~(UndefBits | ZeroSplat);
18319 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
18320 UndefBits, &LHS)) ||
18321 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
18322 UndefBits, &LHS)))
18323 return NewOp;
18324 }
18325
18326 return SDValue();
18327}
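// Illustrative sketch (assumed example): for a NEON AND with a constant splat
//   t1: v4i32 = and t0, build_vector(0xFFFFFF00, 0xFFFFFF00, ...)
// the bits to clear (0x000000FF) fit an AdvSIMD modified immediate, so the
// node can be selected as a BIC (vector, immediate) of #0xFF rather than
// materializing the AND mask in a register.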
18328
18329static SDValue performFADDCombine(SDNode *N,
18330                                  TargetLowering::DAGCombinerInfo &DCI) {
18331  SelectionDAG &DAG = DCI.DAG;
18332 SDValue LHS = N->getOperand(0);
18333 SDValue RHS = N->getOperand(1);
18334 EVT VT = N->getValueType(0);
18335 SDLoc DL(N);
18336
18337 if (!N->getFlags().hasAllowReassociation())
18338 return SDValue();
18339
18340  // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
18341 auto ReassocComplex = [&](SDValue A, SDValue B) {
18342 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
18343 return SDValue();
18344 unsigned Opc = A.getConstantOperandVal(0);
18345 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
18346 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
18347 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
18348 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
18349 return SDValue();
18350 SDValue VCMLA = DAG.getNode(
18351 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
18352 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
18353 A.getOperand(2), A.getOperand(3));
18354 VCMLA->setFlags(A->getFlags());
18355 return VCMLA;
18356 };
18357 if (SDValue R = ReassocComplex(LHS, RHS))
18358 return R;
18359 if (SDValue R = ReassocComplex(RHS, LHS))
18360 return R;
18361
18362 return SDValue();
18363}
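// Illustrative sketch (assumed example): with reassociation allowed,
//   t2: v4f32 = fadd a, (vcmla acc, x, y)
// becomes (vcmla (fadd acc, a), x, y), folding the outer FADD into the FCMLA
// accumulator and removing one vector add from the critical path.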
18364
18365static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
18366 switch (Opcode) {
18367 case ISD::STRICT_FADD:
18368 case ISD::FADD:
18369 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
18370 case ISD::ADD:
18371 return VT == MVT::i64;
18372 default:
18373 return false;
18374 }
18375}
18376
18377static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
18378                        AArch64CC::CondCode Cond);
18379
18380static bool isPredicateCCSettingOp(SDValue N) {
18381 if ((N.getOpcode() == ISD::SETCC) ||
18382 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
18383 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
18384 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
18385 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
18386 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
18387 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
18388 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
18389 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
18390 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
18391 // get_active_lane_mask is lowered to a whilelo instruction.
18392 N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
18393 return true;
18394
18395 return false;
18396}
18397
18398// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
18399// ... into: "ptrue p, all" + PTEST
18400static SDValue
18401performFirstTrueTestVectorCombine(SDNode *N,
18402                                  TargetLowering::DAGCombinerInfo &DCI,
18403                                  const AArch64Subtarget *Subtarget) {
18404 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18405 // Make sure PTEST can be legalised with illegal types.
18406 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
18407 return SDValue();
18408
18409 SDValue N0 = N->getOperand(0);
18410 EVT VT = N0.getValueType();
18411
18412 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
18413 !isNullConstant(N->getOperand(1)))
18414 return SDValue();
18415
18416  // Restrict the DAG combine to only cases where we're extracting from a
18417 // flag-setting operation.
18418 if (!isPredicateCCSettingOp(N0))
18419 return SDValue();
18420
18421 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
18422 SelectionDAG &DAG = DCI.DAG;
18423 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
18424 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
18425}
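// Illustrative sketch (assumed example): given
//   t1: nxv16i1 = whilelo ...
//   t2: i1 = extract_vector_elt t1, 0
// the lane-0 extract is re-expressed as PTEST(ptrue all, t1) with the
// FIRST_ACTIVE condition, so the result is read from NZCV instead of moving a
// predicate element into a general-purpose register.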
18426
18427// Materialize : Idx = (add (mul vscale, NumEls), -1)
18428// i1 = extract_vector_elt t37, Constant:i64<Idx>
18429// ... into: "ptrue p, all" + PTEST
18430static SDValue
18431performLastTrueTestVectorCombine(SDNode *N,
18432                                 TargetLowering::DAGCombinerInfo &DCI,
18433                                 const AArch64Subtarget *Subtarget) {
18434 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18435  // Make sure PTEST is legal for these types.
18436 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
18437 return SDValue();
18438
18439 SDValue N0 = N->getOperand(0);
18440 EVT OpVT = N0.getValueType();
18441
18442 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
18443 return SDValue();
18444
18445 // Idx == (add (mul vscale, NumEls), -1)
18446 SDValue Idx = N->getOperand(1);
18447 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
18448 return SDValue();
18449
18450 SDValue VS = Idx.getOperand(0);
18451 if (VS.getOpcode() != ISD::VSCALE)
18452 return SDValue();
18453
18454 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
18455 if (VS.getConstantOperandVal(0) != NumEls)
18456 return SDValue();
18457
18458 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
18459 SelectionDAG &DAG = DCI.DAG;
18460 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
18461 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
18462}
18463
18464static SDValue
18465performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18466                               const AArch64Subtarget *Subtarget) {
18467 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18468 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
18469 return Res;
18470 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
18471 return Res;
18472
18473 SelectionDAG &DAG = DCI.DAG;
18474 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
18475
18476 EVT VT = N->getValueType(0);
18477 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
18478 bool IsStrict = N0->isStrictFPOpcode();
18479
18480 // extract(dup x) -> x
18481 if (N0.getOpcode() == AArch64ISD::DUP)
18482 return VT.isInteger() ? DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT)
18483 : N0.getOperand(0);
18484
18485 // Rewrite for pairwise fadd pattern
18486 // (f32 (extract_vector_elt
18487 // (fadd (vXf32 Other)
18488 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
18489 // ->
18490 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
18491 // (extract_vector_elt (vXf32 Other) 1))
18492 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
18493 // we can only do this when it's used only by the extract_vector_elt.
18494 if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
18495 (!IsStrict || N0.hasOneUse())) {
18496 SDLoc DL(N0);
18497 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
18498 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
18499
18500 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
18501 SDValue Other = N00;
18502
18503 // And handle the commutative case.
18504 if (!Shuffle) {
18505 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
18506 Other = N01;
18507 }
18508
18509 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
18510 Other == Shuffle->getOperand(0)) {
18511 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
18512 DAG.getConstant(0, DL, MVT::i64));
18513 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
18514 DAG.getConstant(1, DL, MVT::i64));
18515 if (!IsStrict)
18516 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
18517
18518 // For strict_fadd we need uses of the final extract_vector to be replaced
18519 // with the strict_fadd, but we also need uses of the chain output of the
18520 // original strict_fadd to use the chain output of the new strict_fadd as
18521 // otherwise it may not be deleted.
18522 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
18523 {VT, MVT::Other},
18524 {N0->getOperand(0), Extract1, Extract2});
18525 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
18526 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
18527 return SDValue(N, 0);
18528 }
18529 }
18530
18531 return SDValue();
18532}
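// Illustrative sketch (assumed example): given
//   t1: v2f32 = vector_shuffle<1,u> t0, undef
//   t2: v2f32 = fadd t0, t1
//   t3: f32   = extract_vector_elt t2, 0
// the combine rewrites t3 as fadd(extract(t0, 0), extract(t0, 1)), which then
// matches the scalar FADDP (pairwise add) pattern instead of a shuffle plus a
// full vector FADD.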
18533
18534static SDValue performConcatVectorsCombine(SDNode *N,
18535                                           TargetLowering::DAGCombinerInfo &DCI,
18536                                           SelectionDAG &DAG) {
18537 SDLoc dl(N);
18538 EVT VT = N->getValueType(0);
18539 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
18540 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
18541
18542 if (VT.isScalableVector())
18543 return SDValue();
18544
18545 // Optimize concat_vectors of truncated vectors, where the intermediate
18546 // type is illegal, to avoid said illegality, e.g.,
18547 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
18548 // (v2i16 (truncate (v2i64)))))
18549 // ->
18550 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
18551 // (v4i32 (bitcast (v2i64))),
18552 // <0, 2, 4, 6>)))
18553 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
18554 // on both input and result type, so we might generate worse code.
18555 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
18556 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
18557 N1Opc == ISD::TRUNCATE) {
18558 SDValue N00 = N0->getOperand(0);
18559 SDValue N10 = N1->getOperand(0);
18560 EVT N00VT = N00.getValueType();
18561
18562 if (N00VT == N10.getValueType() &&
18563 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
18564 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
18565 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
18566      SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
18567      for (size_t i = 0; i < Mask.size(); ++i)
18568 Mask[i] = i * 2;
18569 return DAG.getNode(ISD::TRUNCATE, dl, VT,
18570 DAG.getVectorShuffle(
18571 MidVT, dl,
18572 DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
18573 DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
18574 }
18575 }
18576
18577 if (N->getOperand(0).getValueType() == MVT::v4i8 ||
18578 N->getOperand(0).getValueType() == MVT::v2i16 ||
18579 N->getOperand(0).getValueType() == MVT::v2i8) {
18580 EVT SrcVT = N->getOperand(0).getValueType();
18581 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
18582 // loads to prevent having to go through the v4i8 load legalization that
18583 // needs to extend each element into a larger type.
18584 if (N->getNumOperands() % 2 == 0 &&
18585 all_of(N->op_values(), [SrcVT](SDValue V) {
18586 if (V.getValueType() != SrcVT)
18587 return false;
18588 if (V.isUndef())
18589 return true;
18590 LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
18591 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
18592 LD->getExtensionType() == ISD::NON_EXTLOAD;
18593 })) {
18594 EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
18595 EVT NVT = EVT::getVectorVT(*DAG.getContext(), FVT, N->getNumOperands());
18596      SmallVector<SDValue> Ops;
18597
18598 for (unsigned i = 0; i < N->getNumOperands(); i++) {
18599 SDValue V = N->getOperand(i);
18600 if (V.isUndef())
18601 Ops.push_back(DAG.getUNDEF(FVT));
18602 else {
18603 LoadSDNode *LD = cast<LoadSDNode>(V);
18604 SDValue NewLoad = DAG.getLoad(FVT, dl, LD->getChain(),
18605 LD->getBasePtr(), LD->getMemOperand());
18606 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
18607 Ops.push_back(NewLoad);
18608 }
18609 }
18610 return DAG.getBitcast(N->getValueType(0),
18611 DAG.getBuildVector(NVT, dl, Ops));
18612 }
18613 }
18614
18615 // Canonicalise concat_vectors to replace concatenations of truncated nots
18616 // with nots of concatenated truncates. This in some cases allows for multiple
18617 // redundant negations to be eliminated.
18618 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
18619 // (v4i16 (truncate (not (v4i32)))))
18620 // ->
18621 // (not (concat_vectors (v4i16 (truncate (v4i32))),
18622 // (v4i16 (truncate (v4i32)))))
18623 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
18624 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
18625 N->isOnlyUserOf(N1.getNode())) {
18626 auto isBitwiseVectorNegate = [](SDValue V) {
18627 return V->getOpcode() == ISD::XOR &&
18628 ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
18629 };
18630 SDValue N00 = N0->getOperand(0);
18631 SDValue N10 = N1->getOperand(0);
18632 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
18633 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
18634 return DAG.getNOT(
18635 dl,
18636 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
18637 DAG.getNode(ISD::TRUNCATE, dl, N0.getValueType(),
18638 N00->getOperand(0)),
18639 DAG.getNode(ISD::TRUNCATE, dl, N1.getValueType(),
18640 N10->getOperand(0))),
18641 VT);
18642 }
18643 }
18644
18645 // Wait till after everything is legalized to try this. That way we have
18646 // legal vector types and such.
18647 if (DCI.isBeforeLegalizeOps())
18648 return SDValue();
18649
18650 // Optimise concat_vectors of two [us]avgceils or [us]avgfloors with a 128-bit
18651  // destination size, combine into an avg of two concats of the source
18652 // vectors. eg: concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c),
18653 // concat(b, d))
18654 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
18655 (N0Opc == ISD::AVGCEILU || N0Opc == ISD::AVGCEILS ||
18656 N0Opc == ISD::AVGFLOORU || N0Opc == ISD::AVGFLOORS) &&
18657 N0->hasOneUse() && N1->hasOneUse()) {
18658 SDValue N00 = N0->getOperand(0);
18659 SDValue N01 = N0->getOperand(1);
18660 SDValue N10 = N1->getOperand(0);
18661 SDValue N11 = N1->getOperand(1);
18662
18663 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
18664 SDValue Concat0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N00, N10);
18665 SDValue Concat1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N01, N11);
18666 return DAG.getNode(N0Opc, dl, VT, Concat0, Concat1);
18667 }
18668 }
18669
18670 auto IsRSHRN = [](SDValue Shr) {
18671 if (Shr.getOpcode() != AArch64ISD::VLSHR)
18672 return false;
18673 SDValue Op = Shr.getOperand(0);
18674 EVT VT = Op.getValueType();
18675 unsigned ShtAmt = Shr.getConstantOperandVal(1);
18676 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
18677 return false;
18678
18679 APInt Imm;
18680 if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift)
18681 Imm = APInt(VT.getScalarSizeInBits(),
18682 Op.getOperand(1).getConstantOperandVal(0)
18683 << Op.getOperand(1).getConstantOperandVal(1));
18684 else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP &&
18685 isa<ConstantSDNode>(Op.getOperand(1).getOperand(0)))
18686 Imm = APInt(VT.getScalarSizeInBits(),
18687 Op.getOperand(1).getConstantOperandVal(0));
18688 else
18689 return false;
18690
18691 if (Imm != 1ULL << (ShtAmt - 1))
18692 return false;
18693 return true;
18694 };
18695
18696 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
18697 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
18698 ((IsRSHRN(N1) &&
18699        N0.getConstantOperandVal(1) == N1.getConstantOperandVal(1)) ||
18700       N1.isUndef())) {
18701 SDValue X = N0.getOperand(0).getOperand(0);
18702 SDValue Y = N1.isUndef() ? DAG.getUNDEF(X.getValueType())
18703 : N1.getOperand(0).getOperand(0);
18704 EVT BVT =
18705 X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
18706 SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, dl, BVT, X, Y);
18707 SDValue Add = DAG.getNode(
18708 ISD::ADD, dl, BVT, CC,
18709 DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), dl, BVT));
18710 SDValue Shr =
18711 DAG.getNode(AArch64ISD::VLSHR, dl, BVT, Add, N0.getOperand(1));
18712 return Shr;
18713 }
18714
18715 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
18716 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
18717 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
18718 N0.getOperand(1) == N1.getOperand(1)) {
18719 SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(0),
18720 DAG.getUNDEF(N0.getValueType()));
18721 SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(1),
18722 DAG.getUNDEF(N0.getValueType()));
18723 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, E0, E1);
18724 }
18725
18726 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
18727 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
18728 // canonicalise to that.
18729 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
18730 assert(VT.getScalarSizeInBits() == 64);
18731 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
18732 DAG.getConstant(0, dl, MVT::i64));
18733 }
18734
18735 // Canonicalise concat_vectors so that the right-hand vector has as few
18736 // bit-casts as possible before its real operation. The primary matching
18737 // destination for these operations will be the narrowing "2" instructions,
18738 // which depend on the operation being performed on this right-hand vector.
18739 // For example,
18740 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
18741 // becomes
18742 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
18743
18744 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
18745 return SDValue();
18746 SDValue RHS = N1->getOperand(0);
18747 MVT RHSTy = RHS.getValueType().getSimpleVT();
18748 // If the RHS is not a vector, this is not the pattern we're looking for.
18749 if (!RHSTy.isVector())
18750 return SDValue();
18751
18752 LLVM_DEBUG(
18753 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
18754
18755 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
18756 RHSTy.getVectorNumElements() * 2);
18757 return DAG.getNode(ISD::BITCAST, dl, VT,
18758 DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
18759 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
18760 RHS));
18761}
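// Illustrative sketch (assumed example): one of the cases handled above is
//   t2: v16i8 = concat_vectors (v8i8 avgflooru a, b), (v8i8 avgflooru c, d)
// which becomes avgflooru(concat(a, c), concat(b, d)), so a single 128-bit
// UHADD replaces two 64-bit ones.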
18762
18763static SDValue
18764performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18765                               SelectionDAG &DAG) {
18766 if (DCI.isBeforeLegalizeOps())
18767 return SDValue();
18768
18769 EVT VT = N->getValueType(0);
18770 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
18771 return SDValue();
18772
18773 SDValue V = N->getOperand(0);
18774
18775 // NOTE: This combine exists in DAGCombiner, but that version's legality check
18776 // blocks this combine because the non-const case requires custom lowering.
18777 //
18778 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
18779 if (V.getOpcode() == ISD::SPLAT_VECTOR)
18780 if (isa<ConstantSDNode>(V.getOperand(0)))
18781 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
18782
18783 return SDValue();
18784}
18785
18786static SDValue
18787performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18788                              SelectionDAG &DAG) {
18789 SDLoc DL(N);
18790 SDValue Vec = N->getOperand(0);
18791 SDValue SubVec = N->getOperand(1);
18792 uint64_t IdxVal = N->getConstantOperandVal(2);
18793 EVT VecVT = Vec.getValueType();
18794 EVT SubVT = SubVec.getValueType();
18795
18796 // Only do this for legal fixed vector types.
18797 if (!VecVT.isFixedLengthVector() ||
18798 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
18799 !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
18800 return SDValue();
18801
18802 // Ignore widening patterns.
18803 if (IdxVal == 0 && Vec.isUndef())
18804 return SDValue();
18805
18806 // Subvector must be half the width and an "aligned" insertion.
18807 unsigned NumSubElts = SubVT.getVectorNumElements();
18808 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
18809 (IdxVal != 0 && IdxVal != NumSubElts))
18810 return SDValue();
18811
18812 // Fold insert_subvector -> concat_vectors
18813 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
18814 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
18815 SDValue Lo, Hi;
18816 if (IdxVal == 0) {
18817 Lo = SubVec;
18818 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
18819 DAG.getVectorIdxConstant(NumSubElts, DL));
18820 } else {
18821 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
18822 DAG.getVectorIdxConstant(0, DL));
18823 Hi = SubVec;
18824 }
18825 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
18826}
18827
18828static SDValue tryCombineFixedPointConvert(SDNode *N,
18829                                           TargetLowering::DAGCombinerInfo &DCI,
18830                                           SelectionDAG &DAG) {
18831 // Wait until after everything is legalized to try this. That way we have
18832 // legal vector types and such.
18833 if (DCI.isBeforeLegalizeOps())
18834 return SDValue();
18835 // Transform a scalar conversion of a value from a lane extract into a
18836 // lane extract of a vector conversion. E.g., from foo1 to foo2:
18837 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
18838 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
18839 //
18840 // The second form interacts better with instruction selection and the
18841 // register allocator to avoid cross-class register copies that aren't
18842 // coalescable due to a lane reference.
18843
18844 // Check the operand and see if it originates from a lane extract.
18845 SDValue Op1 = N->getOperand(1);
18846  if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
18847    return SDValue();
18848
18849 // Yep, no additional predication needed. Perform the transform.
18850 SDValue IID = N->getOperand(0);
18851 SDValue Shift = N->getOperand(2);
18852 SDValue Vec = Op1.getOperand(0);
18853 SDValue Lane = Op1.getOperand(1);
18854 EVT ResTy = N->getValueType(0);
18855 EVT VecResTy;
18856 SDLoc DL(N);
18857
18858 // The vector width should be 128 bits by the time we get here, even
18859 // if it started as 64 bits (the extract_vector handling will have
18860 // done so). Bail if it is not.
18861 if (Vec.getValueSizeInBits() != 128)
18862 return SDValue();
18863
18864 if (Vec.getValueType() == MVT::v4i32)
18865 VecResTy = MVT::v4f32;
18866 else if (Vec.getValueType() == MVT::v2i64)
18867 VecResTy = MVT::v2f64;
18868 else
18869 return SDValue();
18870
18871 SDValue Convert =
18872 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
18873 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
18874}
18875
18876// AArch64 high-vector "long" operations are formed by performing the non-high
18877// version on an extract_subvector of each operand which gets the high half:
18878//
18879// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
18880//
18881// However, there are cases which don't have an extract_high explicitly, but
18882// have another operation that can be made compatible with one for free. For
18883// example:
18884//
18885// (dupv64 scalar) --> (extract_high (dup128 scalar))
18886//
18887// This routine does the actual conversion of such DUPs, once outer routines
18888// have determined that everything else is in order.
18889// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
18890// similarly here.
18891static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
18892  MVT VT = N.getSimpleValueType();
18893 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
18894 N.getConstantOperandVal(1) == 0)
18895 N = N.getOperand(0);
18896
18897 switch (N.getOpcode()) {
18898 case AArch64ISD::DUP:
18899  case AArch64ISD::DUPLANE8:
18900  case AArch64ISD::DUPLANE16:
18901  case AArch64ISD::DUPLANE32:
18902  case AArch64ISD::DUPLANE64:
18903  case AArch64ISD::MOVI:
18904  case AArch64ISD::MOVIshift:
18905  case AArch64ISD::MOVIedit:
18906  case AArch64ISD::MOVImsl:
18907  case AArch64ISD::MVNIshift:
18908  case AArch64ISD::MVNImsl:
18909 break;
18910 default:
18911 // FMOV could be supported, but isn't very useful, as it would only occur
18912    // if you passed a bitcast floating-point immediate to an eligible long
18913 // integer op (addl, smull, ...).
18914 return SDValue();
18915 }
18916
18917 if (!VT.is64BitVector())
18918 return SDValue();
18919
18920 SDLoc DL(N);
18921 unsigned NumElems = VT.getVectorNumElements();
18922 if (N.getValueType().is64BitVector()) {
18923 MVT ElementTy = VT.getVectorElementType();
18924 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
18925 N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
18926 }
18927
18928 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
18929 DAG.getConstant(NumElems, DL, MVT::i64));
18930}
18931
18932static bool isEssentiallyExtractHighSubvector(SDValue N) {
18933  if (N.getOpcode() == ISD::BITCAST)
18934 N = N.getOperand(0);
18935 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
18936 return false;
18937 if (N.getOperand(0).getValueType().isScalableVector())
18938 return false;
18939 return N.getConstantOperandAPInt(1) ==
18940 N.getOperand(0).getValueType().getVectorNumElements() / 2;
18941}
18942
18943/// Helper structure to keep track of ISD::SET_CC operands.
18944struct GenericSetCCInfo {
18945  const SDValue *Opnd0;
18946  const SDValue *Opnd1;
18947  ISD::CondCode CC;
18948};
18949
18950/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
18951struct AArch64SetCCInfo {
18952  const SDValue *Cmp;
18953  AArch64CC::CondCode CC;
18954};
18955
18956/// Helper structure to keep track of SetCC information.
18957union SetCCInfo {
18958  GenericSetCCInfo Generic;
18959  AArch64SetCCInfo AArch64;
18960};
18961
18962/// Helper structure to be able to read SetCC information. If the IsAArch64
18963/// field is set to true, Info is an AArch64SetCCInfo; otherwise Info is a
18964/// GenericSetCCInfo.
18965struct SetCCInfoAndKind {
18966  SetCCInfo Info;
18967  bool IsAArch64;
18968};
18969
18970/// Check whether or not \p Op is a SET_CC operation, either a generic or
18971/// an AArch64 lowered one.
18973/// \p SetCCInfo is filled accordingly.
18974/// \post SetCCInfo is meaningful only when this function returns true.
18975/// \return True when Op is a kind of SET_CC operation.
18976static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
18977  // If this is a setcc, this is straightforward.
18978 if (Op.getOpcode() == ISD::SETCC) {
18979 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
18980 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
18981 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
18982 SetCCInfo.IsAArch64 = false;
18983 return true;
18984 }
18985 // Otherwise, check if this is a matching csel instruction.
18986 // In other words:
18987 // - csel 1, 0, cc
18988 // - csel 0, 1, !cc
18989 if (Op.getOpcode() != AArch64ISD::CSEL)
18990 return false;
18991 // Set the information about the operands.
18992 // TODO: we want the operands of the Cmp not the csel
18993 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
18994 SetCCInfo.IsAArch64 = true;
18995 SetCCInfo.Info.AArch64.CC =
18996 static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
18997
18998 // Check that the operands matches the constraints:
18999 // (1) Both operands must be constants.
19000 // (2) One must be 1 and the other must be 0.
19001 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
19002 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
19003
19004 // Check (1).
19005 if (!TValue || !FValue)
19006 return false;
19007
19008 // Check (2).
19009 if (!TValue->isOne()) {
19010 // Update the comparison when we are interested in !cc.
19011 std::swap(TValue, FValue);
19012 SetCCInfo.Info.AArch64.CC =
19014 }
19015 return TValue->isOne() && FValue->isZero();
19016}
19017
19018// Returns true if Op is setcc or zext of setcc.
19019static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
19020 if (isSetCC(Op, Info))
19021 return true;
19022 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
19023 isSetCC(Op->getOperand(0), Info));
19024}
19025
19026// The folding we want to perform is:
19027// (add x, [zext] (setcc cc ...) )
19028// -->
19029// (csel x, (add x, 1), !cc ...)
19030//
19031// The latter will get matched to a CSINC instruction.
19032static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
19033  assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
19034 SDValue LHS = Op->getOperand(0);
19035 SDValue RHS = Op->getOperand(1);
19036 SetCCInfoAndKind InfoAndKind;
19037
19038 // If both operands are a SET_CC, then we don't want to perform this
19039 // folding and create another csel as this results in more instructions
19040 // (and higher register usage).
19041 if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
19042 isSetCCOrZExtSetCC(RHS, InfoAndKind))
19043 return SDValue();
19044
19045 // If neither operand is a SET_CC, give up.
19046 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
19047 std::swap(LHS, RHS);
19048 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
19049 return SDValue();
19050 }
19051
19052  // FIXME: This could be generalized to work for FP comparisons.
19053 EVT CmpVT = InfoAndKind.IsAArch64
19054 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
19055 : InfoAndKind.Info.Generic.Opnd0->getValueType();
19056 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
19057 return SDValue();
19058
19059 SDValue CCVal;
19060 SDValue Cmp;
19061 SDLoc dl(Op);
19062 if (InfoAndKind.IsAArch64) {
19063 CCVal = DAG.getConstant(
19064        AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
19065        MVT::i32);
19066 Cmp = *InfoAndKind.Info.AArch64.Cmp;
19067 } else
19068 Cmp = getAArch64Cmp(
19069 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
19070 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
19071 dl);
19072
19073 EVT VT = Op->getValueType(0);
19074 LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
19075 return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
19076}
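// Illustrative sketch (assumed example): for C code such as
//   int f(int x, int a, int b) { return x + (a < b); }
// the (add x, (setcc lt a, b)) becomes (csel x, (add x, 1), ge, cmp), which
// selects to "cmp a, b" followed by a single CSINC instead of a CSET plus an
// ADD.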
19077
19078// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
19079static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
19080  EVT VT = N->getValueType(0);
19081 // Only scalar integer and vector types.
19082 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
19083 return SDValue();
19084
19085 SDValue LHS = N->getOperand(0);
19086 SDValue RHS = N->getOperand(1);
19087 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19088 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
19089 return SDValue();
19090
19091 auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
19092 auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
19093 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
19094 return SDValue();
19095
19096 SDValue Op1 = LHS->getOperand(0);
19097 SDValue Op2 = RHS->getOperand(0);
19098 EVT OpVT1 = Op1.getValueType();
19099 EVT OpVT2 = Op2.getValueType();
19100 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
19101 Op2.getOpcode() != AArch64ISD::UADDV ||
19102 OpVT1.getVectorElementType() != VT)
19103 return SDValue();
19104
19105 SDValue Val1 = Op1.getOperand(0);
19106 SDValue Val2 = Op2.getOperand(0);
19107 EVT ValVT = Val1->getValueType(0);
19108 SDLoc DL(N);
19109 SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
19110 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
19111 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
19112 DAG.getConstant(0, DL, MVT::i64));
19113}
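// Illustrative sketch (assumed example):
//   t3: i32 = add (extract_vector_elt (uaddv A), 0),
//                 (extract_vector_elt (uaddv B), 0)
// is folded to extract_vector_elt (uaddv (add A, B)), 0, so only one ADDV
// reduction is executed instead of two.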
19114
19115/// Perform the scalar expression combine in the form of:
19116/// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
19117/// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
19118static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
19119  EVT VT = N->getValueType(0);
19120 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
19121 return SDValue();
19122
19123 SDValue LHS = N->getOperand(0);
19124 SDValue RHS = N->getOperand(1);
19125
19126  // Handle commutativity.
19127 if (LHS.getOpcode() != AArch64ISD::CSEL &&
19128 LHS.getOpcode() != AArch64ISD::CSNEG) {
19129 std::swap(LHS, RHS);
19130 if (LHS.getOpcode() != AArch64ISD::CSEL &&
19131 LHS.getOpcode() != AArch64ISD::CSNEG) {
19132 return SDValue();
19133 }
19134 }
19135
19136 if (!LHS.hasOneUse())
19137 return SDValue();
19138
19139 AArch64CC::CondCode AArch64CC =
19140 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
19141
19142  // The CSEL should include a constant one operand, and the CSNEG should
19143  // include a one or a negative-one operand.
19144 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
19145 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
19146 if (!CTVal || !CFVal)
19147 return SDValue();
19148
19149 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
19150 (CTVal->isOne() || CFVal->isOne())) &&
19151 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
19152 (CTVal->isOne() || CFVal->isAllOnes())))
19153 return SDValue();
19154
19155 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
19156 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
19157 !CFVal->isOne()) {
19158 std::swap(CTVal, CFVal);
19159 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
19160 }
19161
19162 SDLoc DL(N);
19163 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
19164 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
19165 !CFVal->isAllOnes()) {
19166 APInt C = -1 * CFVal->getAPIntValue();
19167 CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
19168 CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
19169 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
19170 }
19171
19172  // It might be neutral for larger constants, as the immediate needs to be
19173 // materialized in a register.
19174 APInt ADDC = CTVal->getAPIntValue();
19175 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19176 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
19177 return SDValue();
19178
19179 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
19180 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
19181 "Unexpected constant value");
19182
19183 SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
19184 SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32);
19185 SDValue Cmp = LHS.getOperand(3);
19186
19187 return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
19188}
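// Illustrative sketch (assumed example):
//   t2: i32 = add b, (csel c, 1, cc, cmp)
// becomes (csinc (add b, c), b, cc, cmp): when cc holds the result is b + c,
// otherwise it is b + 1, exactly matching CSINC and avoiding a separate select
// of the constant 1.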
19189
19190// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
19191static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
19192  EVT VT = N->getValueType(0);
19193 if (N->getOpcode() != ISD::ADD)
19194 return SDValue();
19195
19196 SDValue Dot = N->getOperand(0);
19197 SDValue A = N->getOperand(1);
19198  // Handle commutativity.
19199 auto isZeroDot = [](SDValue Dot) {
19200 return (Dot.getOpcode() == AArch64ISD::UDOT ||
19201 Dot.getOpcode() == AArch64ISD::SDOT) &&
19202           isZerosVector(Dot.getOperand(0).getNode());
19203  };
19204 if (!isZeroDot(Dot))
19205 std::swap(Dot, A);
19206 if (!isZeroDot(Dot))
19207 return SDValue();
19208
19209 return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
19210 Dot.getOperand(2));
19211}
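// Illustrative sketch (assumed example):
//   t2: v4i32 = add (udot (splat 0), x, y), acc
// is rewritten to (udot acc, x, y); the dot product accumulates directly into
// acc and the extra vector ADD disappears.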
19212
19213static bool isNegatedInteger(SDValue Op) {
19214  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
19215}
19216
19217static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
19218  SDLoc DL(Op);
19219 EVT VT = Op.getValueType();
19220 SDValue Zero = DAG.getConstant(0, DL, VT);
19221 return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
19222}
19223
19224// Try to fold
19225//
19226// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
19227//
19228// The folding helps csel to be matched with csneg without generating
19229// redundant neg instruction, which includes negation of the csel expansion
19230// of abs node lowered by lowerABS.
19231static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
19232  if (!isNegatedInteger(SDValue(N, 0)))
19233 return SDValue();
19234
19235 SDValue CSel = N->getOperand(1);
19236 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
19237 return SDValue();
19238
19239 SDValue N0 = CSel.getOperand(0);
19240 SDValue N1 = CSel.getOperand(1);
19241
19242  // If neither of them is a negation, it's not worth the folding, as it
19243 // introduces two additional negations while reducing one negation.
19244 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
19245 return SDValue();
19246
19247 SDValue N0N = getNegatedInteger(N0, DAG);
19248 SDValue N1N = getNegatedInteger(N1, DAG);
19249
19250 SDLoc DL(N);
19251 EVT VT = CSel.getValueType();
19252 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
19253 CSel.getOperand(3));
19254}
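// Illustrative sketch (assumed example):
//   t2: i32 = sub 0, (csel (sub 0, a), b, cc, cmp)
// distributes the negation to give (csel a, (sub 0, b), cc, cmp), which then
// matches CSNEG a, b, cc instead of a CSEL followed by a separate NEG.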
19255
19256// The basic add/sub long vector instructions have variants with "2" on the end
19257// which act on the high-half of their inputs. They are normally matched by
19258// patterns like:
19259//
19260// (add (zeroext (extract_high LHS)),
19261// (zeroext (extract_high RHS)))
19262// -> uaddl2 vD, vN, vM
19263//
19264// However, if one of the extracts is something like a duplicate, this
19265// instruction can still be used profitably. This function puts the DAG into a
19266// more appropriate form for those patterns to trigger.
19267static SDValue performAddSubLongCombine(SDNode *N,
19268                                        TargetLowering::DAGCombinerInfo &DCI) {
19269  SelectionDAG &DAG = DCI.DAG;
19270 if (DCI.isBeforeLegalizeOps())
19271 return SDValue();
19272
19273 MVT VT = N->getSimpleValueType(0);
19274 if (!VT.is128BitVector()) {
19275 if (N->getOpcode() == ISD::ADD)
19276 return performSetccAddFolding(N, DAG);
19277 return SDValue();
19278 }
19279
19280 // Make sure both branches are extended in the same way.
19281 SDValue LHS = N->getOperand(0);
19282 SDValue RHS = N->getOperand(1);
19283 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
19284 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
19285 LHS.getOpcode() != RHS.getOpcode())
19286 return SDValue();
19287
19288 unsigned ExtType = LHS.getOpcode();
19289
19290 // It's not worth doing if at least one of the inputs isn't already an
19291 // extract, but we don't know which it'll be so we have to try both.
19292 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
19293 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
19294 if (!RHS.getNode())
19295 return SDValue();
19296
19297 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
19298 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
19299 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
19300 if (!LHS.getNode())
19301 return SDValue();
19302
19303 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
19304 }
19305
19306 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
19307}
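// Illustrative sketch (assumed example): in
//   t3: v8i16 = add (zext (extract_subvector a, 8)), (zext (dup b))
// the 64-bit DUP is rewritten as a 128-bit DUP followed by an extract of its
// high half, so both operands look like extract_high nodes and the whole add
// can be selected as UADDL2.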
19308
19309static bool isCMP(SDValue Op) {
19310 return Op.getOpcode() == AArch64ISD::SUBS &&
19311 !Op.getNode()->hasAnyUseOfValue(0);
19312}
19313
19314// (CSEL 1 0 CC Cond) => CC
19315// (CSEL 0 1 CC Cond) => !CC
19316static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
19317 if (Op.getOpcode() != AArch64ISD::CSEL)
19318 return std::nullopt;
19319 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
19320 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
19321 return std::nullopt;
19322 SDValue OpLHS = Op.getOperand(0);
19323 SDValue OpRHS = Op.getOperand(1);
19324 if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
19325 return CC;
19326 if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
19327 return getInvertedCondCode(CC);
19328
19329 return std::nullopt;
19330}
19331
19332// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
19333// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
19334static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
19335 SDValue CmpOp = Op->getOperand(2);
19336 if (!isCMP(CmpOp))
19337 return SDValue();
19338
19339 if (IsAdd) {
19340 if (!isOneConstant(CmpOp.getOperand(1)))
19341 return SDValue();
19342 } else {
19343 if (!isNullConstant(CmpOp.getOperand(0)))
19344 return SDValue();
19345 }
19346
19347 SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
19348 auto CC = getCSETCondCode(CsetOp);
19349 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
19350 return SDValue();
19351
19352 return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
19353 Op->getOperand(0), Op->getOperand(1),
19354 CsetOp.getOperand(3));
19355}
19356
19357// (ADC x 0 cond) => (CINC x HS cond)
19358static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
19359  SDValue LHS = N->getOperand(0);
19360 SDValue RHS = N->getOperand(1);
19361 SDValue Cond = N->getOperand(2);
19362
19363 if (!isNullConstant(RHS))
19364 return SDValue();
19365
19366 EVT VT = N->getValueType(0);
19367 SDLoc DL(N);
19368
19369 // (CINC x cc cond) <=> (CSINC x x !cc cond)
19370 SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32);
19371 return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
19372}
19373
19374// Transform vector add(zext i8 to i32, zext i8 to i32)
19375// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
19376// This allows extra uses of saddl/uaddl at the lower vector widths, and fewer
19377// extends.
19378static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG) {
19379  EVT VT = N->getValueType(0);
19380 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
19381 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
19382 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
19383 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
19384 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
19385 N->getOperand(0).getOperand(0).getValueType() !=
19386 N->getOperand(1).getOperand(0).getValueType())
19387 return SDValue();
19388
19389 SDValue N0 = N->getOperand(0).getOperand(0);
19390 SDValue N1 = N->getOperand(1).getOperand(0);
19391 EVT InVT = N0.getValueType();
19392
19393 EVT S1 = InVT.getScalarType();
19394 EVT S2 = VT.getScalarType();
19395 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
19396 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
19397 SDLoc DL(N);
19398 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
19399                                  S2 == MVT::i32 ? MVT::i16 : MVT::i32,
19400                                  VT.getVectorElementCount());
19401    SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
19402 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
19403 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
19404 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewOp);
19405 }
19406 return SDValue();
19407}
19408
19409static SDValue performBuildVectorCombine(SDNode *N,
19410                                         TargetLowering::DAGCombinerInfo &DCI,
19411                                         SelectionDAG &DAG) {
19412 SDLoc DL(N);
19413 EVT VT = N->getValueType(0);
19414
19415 if (VT == MVT::v4f16 || VT == MVT::v4bf16) {
19416 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1),
19417 Elt2 = N->getOperand(2), Elt3 = N->getOperand(3);
19418 if (Elt0->getOpcode() == ISD::FP_ROUND &&
19419 Elt1->getOpcode() == ISD::FP_ROUND &&
19420 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
19421 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
19422 Elt0->getConstantOperandVal(1) == Elt1->getConstantOperandVal(1) &&
19423        Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19424        Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19425 // Constant index.
19426 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
19427 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
19428 Elt0->getOperand(0)->getOperand(0) ==
19429 Elt1->getOperand(0)->getOperand(0) &&
19430 Elt0->getOperand(0)->getConstantOperandVal(1) == 0 &&
19431 Elt1->getOperand(0)->getConstantOperandVal(1) == 1) {
19432 SDValue LowLanesSrcVec = Elt0->getOperand(0)->getOperand(0);
19433 if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
19434 SDValue HighLanes;
19435 if (Elt2->getOpcode() == ISD::UNDEF &&
19436 Elt3->getOpcode() == ISD::UNDEF) {
19437 HighLanes = DAG.getUNDEF(MVT::v2f32);
19438 } else if (Elt2->getOpcode() == ISD::FP_ROUND &&
19439 Elt3->getOpcode() == ISD::FP_ROUND &&
19440 isa<ConstantSDNode>(Elt2->getOperand(1)) &&
19441 isa<ConstantSDNode>(Elt3->getOperand(1)) &&
19442 Elt2->getConstantOperandVal(1) ==
19443 Elt3->getConstantOperandVal(1) &&
19444                   Elt2->getOperand(0)->getOpcode() ==
19445                       ISD::EXTRACT_VECTOR_ELT &&
19446                   Elt3->getOperand(0)->getOpcode() ==
19447                       ISD::EXTRACT_VECTOR_ELT &&
19448 // Constant index.
19449 isa<ConstantSDNode>(Elt2->getOperand(0)->getOperand(1)) &&
19450 isa<ConstantSDNode>(Elt3->getOperand(0)->getOperand(1)) &&
19451 Elt2->getOperand(0)->getOperand(0) ==
19452 Elt3->getOperand(0)->getOperand(0) &&
19453 Elt2->getOperand(0)->getConstantOperandVal(1) == 0 &&
19454 Elt3->getOperand(0)->getConstantOperandVal(1) == 1) {
19455 SDValue HighLanesSrcVec = Elt2->getOperand(0)->getOperand(0);
19456 HighLanes =
19457 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, HighLanesSrcVec);
19458 }
19459 if (HighLanes) {
19460 SDValue DoubleToSingleSticky =
19461 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, LowLanesSrcVec);
19462 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
19463 DoubleToSingleSticky, HighLanes);
19464 return DAG.getNode(ISD::FP_ROUND, DL, VT, Concat,
19465 Elt0->getOperand(1));
19466 }
19467 }
19468 }
19469 }
19470
19471 if (VT == MVT::v2f64) {
19472 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
19473 if (Elt0->getOpcode() == ISD::FP_EXTEND &&
19474 Elt1->getOpcode() == ISD::FP_EXTEND &&
19475        Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19476        Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19477 Elt0->getOperand(0)->getOperand(0) ==
19478 Elt1->getOperand(0)->getOperand(0) &&
19479 // Constant index.
19480 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
19481 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
19482 Elt0->getOperand(0)->getConstantOperandVal(1) + 1 ==
19483 Elt1->getOperand(0)->getConstantOperandVal(1) &&
19484 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
19485 // ResultType's known minimum vector length.
19486 Elt0->getOperand(0)->getConstantOperandVal(1) %
19487                VT.getVectorMinNumElements() ==
19488            0) {
19489 SDValue SrcVec = Elt0->getOperand(0)->getOperand(0);
19490 if (SrcVec.getValueType() == MVT::v4f16 ||
19491 SrcVec.getValueType() == MVT::v4bf16) {
19492 SDValue HalfToSingle =
19493 DAG.getNode(ISD::FP_EXTEND, DL, MVT::v4f32, SrcVec);
19494 SDValue SubvectorIdx = Elt0->getOperand(0)->getOperand(1);
19495 SDValue Extract = DAG.getNode(
19496          ISD::EXTRACT_SUBVECTOR, DL, VT.changeVectorElementType(MVT::f32),
19497          HalfToSingle, SubvectorIdx);
19498 return DAG.getNode(ISD::FP_EXTEND, DL, VT, Extract);
19499 }
19500 }
19501 }
19502
19503 // A build vector of two extracted elements is equivalent to an
19504 // extract subvector where the inner vector is any-extended to the
19505 // extract_vector_elt VT.
19506 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
19507 // (extract_elt_iXX_to_i32 vec Idx+1))
19508 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
19509
19510 // For now, only consider the v2i32 case, which arises as a result of
19511 // legalization.
19512 if (VT != MVT::v2i32)
19513 return SDValue();
19514
19515 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
19516 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
19517 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19518 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19519 // Constant index.
19520 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
19521 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
19522 // Both EXTRACT_VECTOR_ELT from same vector...
19523 Elt0->getOperand(0) == Elt1->getOperand(0) &&
19524 // ... and contiguous. First element's index +1 == second element's index.
19525 Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
19526 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
19527 // ResultType's known minimum vector length.
19528 Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
19529 SDValue VecToExtend = Elt0->getOperand(0);
19530 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
19531 if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
19532 return SDValue();
19533
19534 SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
19535
19536 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
19537 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
19538 SubvectorIdx);
19539 }
19540
19541 return SDValue();
19542}
19543
19544static SDValue performTruncateCombine(SDNode *N,
19545                                      SelectionDAG &DAG) {
19546 EVT VT = N->getValueType(0);
19547 SDValue N0 = N->getOperand(0);
19548 if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
19549 N0.getOpcode() == AArch64ISD::DUP) {
19550 SDValue Op = N0.getOperand(0);
19551 if (VT.getScalarType() == MVT::i32 &&
19552 N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
19553 Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i32, Op);
19554 return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Op);
19555 }
19556
19557 return SDValue();
19558}
19559
19560// Check whether a node is an extend or shift operand.
19561static bool isExtendOrShiftOperand(SDValue N) {
19562  unsigned Opcode = N.getOpcode();
19563 if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
19564 EVT SrcVT;
19565 if (Opcode == ISD::SIGN_EXTEND_INREG)
19566 SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
19567 else
19568 SrcVT = N.getOperand(0).getValueType();
19569
19570 return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
19571 } else if (Opcode == ISD::AND) {
19572 ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
19573 if (!CSD)
19574 return false;
19575 uint64_t AndMask = CSD->getZExtValue();
19576 return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
19577 } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
19578 return isa<ConstantSDNode>(N.getOperand(1));
19579 }
19580
19581 return false;
19582}
19583
19584// (N - Y) + Z --> (Z - Y) + N
19585// when N is an extend or shift operand
19586static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z,
19587                                         SelectionDAG &DAG) {
19588 auto IsOneUseExtend = [](SDValue N) {
19589 return N.hasOneUse() && isExtendOrShiftOperand(N);
19590 };
19591
19592  // DAGCombiner will revert the combination when Z is a constant, causing an
19593  // infinite loop, so don't enable the combination when Z is a constant.
19594  // If Z is a one-use shift by a constant, we also can't do the optimization,
19595  // as it would fall into the same infinite loop.
19596 if (isa<ConstantSDNode>(Z) || IsOneUseExtend(Z))
19597 return SDValue();
19598
19599 if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
19600 return SDValue();
19601
19602 SDValue Shift = SUB.getOperand(0);
19603 if (!IsOneUseExtend(Shift))
19604 return SDValue();
19605
19606 SDLoc DL(N);
19607 EVT VT = N->getValueType(0);
19608
19609 SDValue Y = SUB.getOperand(1);
19610 SDValue NewSub = DAG.getNode(ISD::SUB, DL, VT, Z, Y);
19611 return DAG.getNode(ISD::ADD, DL, VT, NewSub, Shift);
19612}
19613
19614static SDValue performAddCombineForShiftedOperands(SDNode *N,
19615                                                   SelectionDAG &DAG) {
19616 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
19617 // commutative.
19618 if (N->getOpcode() != ISD::ADD)
19619 return SDValue();
19620
19621 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
19622 // shifted register is only available for i32 and i64.
19623 EVT VT = N->getValueType(0);
19624 if (VT != MVT::i32 && VT != MVT::i64)
19625 return SDValue();
19626
19627 SDLoc DL(N);
19628 SDValue LHS = N->getOperand(0);
19629 SDValue RHS = N->getOperand(1);
19630
19631 if (SDValue Val = performAddCombineSubShift(N, LHS, RHS, DAG))
19632 return Val;
19633 if (SDValue Val = performAddCombineSubShift(N, RHS, LHS, DAG))
19634 return Val;
19635
19636 uint64_t LHSImm = 0, RHSImm = 0;
19637  // If both operands are shifted by an immediate and the shift amount is not
19638  // greater than 4 for one operand, swap LHS and RHS to put the operand with
19639  // the smaller shift amount on the RHS.
19640 //
19641 // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
19642 // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
19643  // with LSL (shift > 4). For the rest of the processors, this is a no-op for
19644  // performance or correctness.
19645 if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
19646 isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
19647 RHSImm > 4 && LHS.hasOneUse())
19648 return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
19649
19650 return SDValue();
19651}
19652
19653// The mid end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2))
19654// This reassociates it back to allow the creation of more mls instructions.
19655static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG) {
19656  if (N->getOpcode() != ISD::SUB)
19657 return SDValue();
19658
19659 SDValue Add = N->getOperand(1);
19660 SDValue X = N->getOperand(0);
19661 if (Add.getOpcode() != ISD::ADD)
19662 return SDValue();
19663
19664 if (!Add.hasOneUse())
19665 return SDValue();
19666  if (DAG.isConstantIntBuildVectorOrConstantInt(X))
19667    return SDValue();
19668
19669 SDValue M1 = Add.getOperand(0);
19670 SDValue M2 = Add.getOperand(1);
19671 if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
19672 M1.getOpcode() != AArch64ISD::UMULL)
19673 return SDValue();
19674 if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
      M2.getOpcode() != AArch64ISD::UMULL)
19676    return SDValue();
19677
19678 EVT VT = N->getValueType(0);
19679 SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, X, M1);
19680 return DAG.getNode(ISD::SUB, SDLoc(N), VT, Sub, M2);
19681}
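// Illustrative sketch (assumed example): for an expression like
//   r = x - (a * b + c * d)
// the mid end produces sub(x, add(mul, mul)); the combine above restores
// sub(sub(x, a*b), c*d) so that each multiply can fold into an MLS
// (multiply-subtract) instruction.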
19682
19683// Combine into mla/mls.
19684// This works on the patterns of:
19685// add v1, (mul v2, v3)
19686// sub v1, (mul v2, v3)
19687// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
19688// It will transform the add/sub to a scalable version, so that we can
19689// make use of SVE's MLA/MLS that will be generated for that pattern
19690static SDValue
19691performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
19692  SelectionDAG &DAG = DCI.DAG;
19693 // Make sure that the types are legal
19694 if (!DCI.isAfterLegalizeDAG())
19695 return SDValue();
19696 // Before using SVE's features, check first if it's available.
19697 if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
19698 return SDValue();
19699
19700 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
19701 return SDValue();
19702
19703 if (!N->getValueType(0).isFixedLengthVector())
19704 return SDValue();
19705
19706 auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
19707 if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
19708 return SDValue();
19709
19710 if (!cast<ConstantSDNode>(Op1->getOperand(1))->isZero())
19711 return SDValue();
19712
19713 SDValue MulValue = Op1->getOperand(0);
19714 if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
19715 return SDValue();
19716
19717 if (!Op1.hasOneUse() || !MulValue.hasOneUse())
19718 return SDValue();
19719
19720 EVT ScalableVT = MulValue.getValueType();
19721 if (!ScalableVT.isScalableVector())
19722 return SDValue();
19723
19724 SDValue ScaledOp = convertToScalableVector(DAG, ScalableVT, Op0);
19725 SDValue NewValue =
19726 DAG.getNode(N->getOpcode(), SDLoc(N), ScalableVT, {ScaledOp, MulValue});
19727 return convertFromScalableVector(DAG, N->getValueType(0), NewValue);
19728 };
19729
19730 if (SDValue res = performOpt(N->getOperand(0), N->getOperand(1)))
19731 return res;
19732 else if (N->getOpcode() == ISD::ADD)
19733 return performOpt(N->getOperand(1), N->getOperand(0));
19734
19735 return SDValue();
19736}
19737
19738// Given a i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
19739// help, for example, to produce ssra from sshr+add.
19740static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) {
19741  EVT VT = N->getValueType(0);
19742 if (VT != MVT::i64)
19743 return SDValue();
19744 SDValue Op0 = N->getOperand(0);
19745 SDValue Op1 = N->getOperand(1);
19746
19747 // At least one of the operands should be an extract, and the other should be
19748 // something that is easy to convert to v1i64 type (in this case a load).
19749 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
19750 Op0.getOpcode() != ISD::LOAD)
19751 return SDValue();
19752 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
19753 Op1.getOpcode() != ISD::LOAD)
19754 return SDValue();
19755
19756 SDLoc DL(N);
19757 if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19758 Op0.getOperand(0).getValueType() == MVT::v1i64) {
19759 Op0 = Op0.getOperand(0);
19760 Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
19761 } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19762 Op1.getOperand(0).getValueType() == MVT::v1i64) {
19763 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
19764 Op1 = Op1.getOperand(0);
19765 } else
19766 return SDValue();
19767
19768 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
19769 DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
19770 DAG.getConstant(0, DL, MVT::i64));
19771}
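// Illustrative sketch (assumed example): for
//   t2: i64 = add (extract_vector_elt (v1i64 sshr t0, #3), 0), (i64 load p)
// the add is performed as a v1i64 vector op, after which the shift-plus-add
// pair can match SSRA (shift right and accumulate) on the D register.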
19772
19773static bool isLoadOrMultipleLoads(SDValue B, SmallVector<LoadSDNode *> &Loads) {
19774  SDValue BV = peekThroughOneUseBitcasts(B);
19775  if (!BV->hasOneUse())
19776 return false;
19777 if (auto *Ld = dyn_cast<LoadSDNode>(BV)) {
19778 if (!Ld || !Ld->isSimple())
19779 return false;
19780 Loads.push_back(Ld);
19781 return true;
19782 } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
19783             BV.getOpcode() == ISD::CONCAT_VECTORS) {
19784    for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
19785 auto *Ld = dyn_cast<LoadSDNode>(BV.getOperand(Op));
19786 if (!Ld || !Ld->isSimple() || !BV.getOperand(Op).hasOneUse())
19787 return false;
19788 Loads.push_back(Ld);
19789 }
19790 return true;
19791 } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
19792 // Try to find a tree of shuffles and concats from how IR shuffles of loads
19793 // are lowered. Note that this only comes up because we do not always visit
19794 // operands before uses. After that is fixed this can be removed and in the
19795 // meantime this is fairly specific to the lowering we expect from IR.
19796 // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
19797 // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
19798 // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
19799 // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
19800 // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
19801 // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
19802 // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
19803 // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
19804 // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
19805 if (B.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE ||
19806 B.getOperand(0).getOperand(0).getOpcode() != ISD::CONCAT_VECTORS ||
19807 B.getOperand(0).getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
19808 B.getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
19809 B.getOperand(1).getNumOperands() != 4)
19810 return false;
19811 auto SV1 = cast<ShuffleVectorSDNode>(B);
19812 auto SV2 = cast<ShuffleVectorSDNode>(B.getOperand(0));
19813 int NumElts = B.getValueType().getVectorNumElements();
19814 int NumSubElts = NumElts / 4;
19815 for (int I = 0; I < NumSubElts; I++) {
19816 // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
19817 if (SV1->getMaskElt(I) != I ||
19818 SV1->getMaskElt(I + NumSubElts) != I + NumSubElts ||
19819 SV1->getMaskElt(I + NumSubElts * 2) != I + NumSubElts * 2 ||
19820 SV1->getMaskElt(I + NumSubElts * 3) != I + NumElts)
19821 return false;
19822 // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
19823 if (SV2->getMaskElt(I) != I ||
19824 SV2->getMaskElt(I + NumSubElts) != I + NumSubElts ||
19825 SV2->getMaskElt(I + NumSubElts * 2) != I + NumElts)
19826 return false;
19827 }
19828 auto *Ld0 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(0));
19829 auto *Ld1 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(1));
19830 auto *Ld2 = dyn_cast<LoadSDNode>(SV2->getOperand(1).getOperand(0));
19831 auto *Ld3 = dyn_cast<LoadSDNode>(B.getOperand(1).getOperand(0));
19832 if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
19833 !Ld2->isSimple() || !Ld3->isSimple())
19834 return false;
19835 Loads.push_back(Ld0);
19836 Loads.push_back(Ld1);
19837 Loads.push_back(Ld2);
19838 Loads.push_back(Ld3);
19839 return true;
19840 }
19841 return false;
19842}
19843
19844static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1,
19845 SelectionDAG &DAG,
19846 unsigned &NumSubLoads) {
19847 if (!Op0.hasOneUse() || !Op1.hasOneUse())
19848 return false;
19849
19850 SmallVector<LoadSDNode *> Loads0, Loads1;
19851 if (isLoadOrMultipleLoads(Op0, Loads0) &&
19852 isLoadOrMultipleLoads(Op1, Loads1)) {
19853 if (NumSubLoads && Loads0.size() != NumSubLoads)
19854 return false;
19855 NumSubLoads = Loads0.size();
19856 return Loads0.size() == Loads1.size() &&
19857 all_of(zip(Loads0, Loads1), [&DAG](auto L) {
19858 unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
19859 return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
19860 DAG.areNonVolatileConsecutiveLoads(get<1>(L), get<0>(L),
19861 Size / 8, 1);
19862 });
19863 }
19864
19865 if (Op0.getOpcode() != Op1.getOpcode())
19866 return false;
19867
19868 switch (Op0.getOpcode()) {
19869 case ISD::ADD:
19870 case ISD::SUB:
19871 return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
19872 DAG, NumSubLoads) &&
19873 areLoadedOffsetButOtherwiseSame(Op0.getOperand(1), Op1.getOperand(1),
19874 DAG, NumSubLoads);
19875 case ISD::SIGN_EXTEND:
19876 case ISD::ANY_EXTEND:
19877 case ISD::ZERO_EXTEND:
19878 EVT XVT = Op0.getOperand(0).getValueType();
19879 if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
19880 XVT.getScalarSizeInBits() != 32)
19881 return false;
19882 return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
19883 DAG, NumSubLoads);
19884 }
19885 return false;
19886}
19887
19888// This method attempts to fold trees of add(ext(load p), shl(ext(load p+4)))
19889// into a single load of twice the size, from which we extract the bottom and
19890// top parts so that the shl can use a shll2 instruction. The two loads in that
19891// example can also be larger trees of instructions, which are identical except
19892// for the leaves, which are all loads offset from the LHS, including
19893// buildvectors of multiple loads. For example, the RHS tree could be
19894// sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4))).
19895// Whilst it can be common for the larger loads to replace LDP instructions
19896// (which doesn't gain anything on its own), the larger loads can help create
19897// more efficient code, and in buildvectors they prevent the need for ld1 lane
19898// inserts, which can be slower than normal loads.
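// Illustrative sketch (not from the original source), following the example
// above: for add(zext(load p), shl(zext(load p+4), C)) with consecutive loads,
// a single load of twice the width is created at p, its low and high halves
// are extracted and extended separately, and the shl is re-applied to the high
// half so that isel can use a shll2-style instruction for it.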
19899static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) {
19900 EVT VT = N->getValueType(0);
19901 if (!VT.isFixedLengthVector() ||
19902 (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
19903 VT.getScalarSizeInBits() != 64))
19904 return SDValue();
19905
19906 SDValue Other = N->getOperand(0);
19907 SDValue Shift = N->getOperand(1);
19908 if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
19909 std::swap(Shift, Other);
19910 APInt ShiftAmt;
19911 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
19912 !ISD::isConstantSplatVector(Shift.getOperand(1).getNode(), ShiftAmt))
19913 return SDValue();
19914
19915 if (!ISD::isExtOpcode(Shift.getOperand(0).getOpcode()) ||
19916 !ISD::isExtOpcode(Other.getOpcode()) ||
19917 Shift.getOperand(0).getOperand(0).getValueType() !=
19918 Other.getOperand(0).getValueType() ||
19919 !Other.hasOneUse() || !Shift.getOperand(0).hasOneUse())
19920 return SDValue();
19921
19922 SDValue Op0 = Other.getOperand(0);
19923 SDValue Op1 = Shift.getOperand(0).getOperand(0);
19924
19925 unsigned NumSubLoads = 0;
19926 if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
19927 return SDValue();
19928
19929 // Attempt to rule out some unprofitable cases using heuristics (some working
19930 // around suboptimal code generation), notably if the extend would not be able
19931 // to use ushll2 instructions because the types are not large enough. Otherwise
19932 // zips will need to be created, which can increase the instruction count.
19933 unsigned NumElts = Op0.getValueType().getVectorNumElements();
19934 unsigned NumSubElts = NumElts / NumSubLoads;
19935 if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
19936 (Other.getOpcode() != Shift.getOperand(0).getOpcode() &&
19937 Op0.getValueType().getSizeInBits() < 128 &&
19939 return SDValue();
19940
19941 // Recreate the tree with the new combined loads.
19942 std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
19943 [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
19944 EVT DVT =
19945 Op0.getValueType().getDoubleNumVectorElementsVT(*DAG.getContext());
19946
19947 SmallVector<LoadSDNode *> Loads0, Loads1;
19948 if (isLoadOrMultipleLoads(Op0, Loads0) &&
19949 isLoadOrMultipleLoads(Op1, Loads1)) {
19950 EVT LoadVT = EVT::getVectorVT(
19951 *DAG.getContext(), Op0.getValueType().getScalarType(),
19952 Op0.getValueType().getVectorNumElements() / Loads0.size());
19953 EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(*DAG.getContext());
19954
19955 SmallVector<SDValue> NewLoads;
19956 for (const auto &[L0, L1] : zip(Loads0, Loads1)) {
19957 SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(),
19958 L0->getBasePtr(), L0->getPointerInfo(),
19959 L0->getOriginalAlign());
19960 DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1));
19961 DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1));
19962 NewLoads.push_back(Load);
19963 }
19964 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op0), DVT, NewLoads);
19965 }
19966
19967 SmallVector<SDValue> Ops;
19968 for (const auto &[O0, O1] : zip(Op0->op_values(), Op1->op_values()))
19969 Ops.push_back(GenCombinedTree(O0, O1, DAG));
19970 return DAG.getNode(Op0.getOpcode(), SDLoc(Op0), DVT, Ops);
19971 };
19972 SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
19973
19974 SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
19975 int Hi = NumSubElts, Lo = 0;
19976 for (unsigned i = 0; i < NumSubLoads; i++) {
19977 for (unsigned j = 0; j < NumSubElts; j++) {
19978 LowMask[i * NumSubElts + j] = Lo++;
19979 HighMask[i * NumSubElts + j] = Hi++;
19980 }
19981 Lo += NumSubElts;
19982 Hi += NumSubElts;
19983 }
19984 SDLoc DL(N);
19985 SDValue Ext0, Ext1;
19986 // Extract the top and bottom lanes, then extend the result. Alternatively,
19987 // extend the result and then extract the lanes if the two operands match, as
19988 // that produces slightly smaller code.
19989 if (Other.getOpcode() != Shift.getOperand(0).getOpcode()) {
19990 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(),
19991 NewOp, DAG.getConstant(0, DL, MVT::i64));
19992 SDValue SubH =
19993 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp,
19994 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
19995 SDValue Extr0 =
19996 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, LowMask);
19997 SDValue Extr1 =
19998 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, HighMask);
19999 Ext0 = DAG.getNode(Other.getOpcode(), DL, VT, Extr0);
20000 Ext1 = DAG.getNode(Shift.getOperand(0).getOpcode(), DL, VT, Extr1);
20001 } else {
20002 EVT DVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
20003 SDValue Ext = DAG.getNode(Other.getOpcode(), DL, DVT, NewOp);
20004 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
20005 DAG.getConstant(0, DL, MVT::i64));
20006 SDValue SubH =
20007 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
20008 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
20009 Ext0 = DAG.getVectorShuffle(VT, DL, SubL, SubH, LowMask);
20010 Ext1 = DAG.getVectorShuffle(VT, DL, SubL, SubH, HighMask);
20011 }
20012 SDValue NShift =
20013 DAG.getNode(Shift.getOpcode(), DL, VT, Ext1, Shift.getOperand(1));
20014 return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift);
20015}
20016
20017static SDValue performAddSubCombine(SDNode *N,
20018 TargetLowering::DAGCombinerInfo &DCI) {
20019 // Try to change sum of two reductions.
20020 if (SDValue Val = performAddUADDVCombine(N, DCI.DAG))
20021 return Val;
20022 if (SDValue Val = performAddDotCombine(N, DCI.DAG))
20023 return Val;
20024 if (SDValue Val = performAddCSelIntoCSinc(N, DCI.DAG))
20025 return Val;
20026 if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
20027 return Val;
20029 return Val;
20031 return Val;
20032 if (SDValue Val = performSubAddMULCombine(N, DCI.DAG))
20033 return Val;
20034 if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
20035 return Val;
20036 if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG))
20037 return Val;
20038
20039 if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
20040 return Val;
20041
20042 return performAddSubLongCombine(N, DCI);
20043}
20044
20045// Massage DAGs which we can use the high-half "long" operations on into
20046// something isel will recognize better. E.g.
20047//
20048// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
20049// (aarch64_neon_umull (extract_high (v2i64 vec)))
20050// (extract_high (v2i64 (dup128 scalar)))))
20051//
20052static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
20053 TargetLowering::DAGCombinerInfo &DCI,
20054 SelectionDAG &DAG) {
20055 if (DCI.isBeforeLegalizeOps())
20056 return SDValue();
20057
20058 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
20059 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
20060 assert(LHS.getValueType().is64BitVector() &&
20061 RHS.getValueType().is64BitVector() &&
20062 "unexpected shape for long operation");
20063
20064 // Either node could be a DUP, but it's not worth doing both of them (you'd
20065 // just as well use the non-high version) so look for a corresponding extract
20066 // operation on the other "wing".
20069 if (!RHS.getNode())
20070 return SDValue();
20073 if (!LHS.getNode())
20074 return SDValue();
20075 } else
20076 return SDValue();
20077
20078 if (IID == Intrinsic::not_intrinsic)
20079 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
20080
20081 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
20082 N->getOperand(0), LHS, RHS);
20083}
20084
20085static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
20086 MVT ElemTy = N->getSimpleValueType(0).getScalarType();
20087 unsigned ElemBits = ElemTy.getSizeInBits();
20088
20089 int64_t ShiftAmount;
20090 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
20091 APInt SplatValue, SplatUndef;
20092 unsigned SplatBitSize;
20093 bool HasAnyUndefs;
20094 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
20095 HasAnyUndefs, ElemBits) ||
20096 SplatBitSize != ElemBits)
20097 return SDValue();
20098
20099 ShiftAmount = SplatValue.getSExtValue();
20100 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
20101 ShiftAmount = CVN->getSExtValue();
20102 } else
20103 return SDValue();
20104
20105 // If the shift amount is zero, remove the shift intrinsic.
20106 if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
20107 return N->getOperand(1);
20108
20109 unsigned Opcode;
20110 bool IsRightShift;
20111 switch (IID) {
20112 default:
20113 llvm_unreachable("Unknown shift intrinsic");
20114 case Intrinsic::aarch64_neon_sqshl:
20115 Opcode = AArch64ISD::SQSHL_I;
20116 IsRightShift = false;
20117 break;
20118 case Intrinsic::aarch64_neon_uqshl:
20119 Opcode = AArch64ISD::UQSHL_I;
20120 IsRightShift = false;
20121 break;
20122 case Intrinsic::aarch64_neon_srshl:
20123 Opcode = AArch64ISD::SRSHR_I;
20124 IsRightShift = true;
20125 break;
20126 case Intrinsic::aarch64_neon_urshl:
20127 Opcode = AArch64ISD::URSHR_I;
20128 IsRightShift = true;
20129 break;
20130 case Intrinsic::aarch64_neon_sqshlu:
20131 Opcode = AArch64ISD::SQSHLU_I;
20132 IsRightShift = false;
20133 break;
20134 case Intrinsic::aarch64_neon_sshl:
20135 case Intrinsic::aarch64_neon_ushl:
20136 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
20137 // left shift in that case. For negative shift amounts we can use VASHR or
20138 // VLSHR as appropriate.
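    // For example (illustrative, not from the original source):
    //   ushl(x, splat(+3)) --> VSHL(x, 3)
    //   ushl(x, splat(-3)) --> VLSHR(x, 3)
    //   sshl(x, splat(-3)) --> VASHR(x, 3)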
20139 if (ShiftAmount < 0) {
20140 Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
20141 : AArch64ISD::VLSHR;
20142 ShiftAmount = -ShiftAmount;
20143 } else
20144 Opcode = AArch64ISD::VSHL;
20145 IsRightShift = false;
20146 break;
20147 }
20148
20149 EVT VT = N->getValueType(0);
20150 SDValue Op = N->getOperand(1);
20151 SDLoc dl(N);
20152 if (VT == MVT::i64) {
20153 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op);
20154 VT = MVT::v1i64;
20155 }
20156
20157 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
20158 Op = DAG.getNode(Opcode, dl, VT, Op,
20159 DAG.getConstant(-ShiftAmount, dl, MVT::i32));
20160 if (N->getValueType(0) == MVT::i64)
20161 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
20162 DAG.getConstant(0, dl, MVT::i64));
20163 return Op;
20164 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
20165 Op = DAG.getNode(Opcode, dl, VT, Op,
20166 DAG.getConstant(ShiftAmount, dl, MVT::i32));
20167 if (N->getValueType(0) == MVT::i64)
20168 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
20169 DAG.getConstant(0, dl, MVT::i64));
20170 return Op;
20171 }
20172
20173 return SDValue();
20174}
20175
20176// The CRC32[BH] instructions ignore the high bits of their data operand. Since
20177// the intrinsics must be legal and take an i32, this means there's almost
20178// certainly going to be a zext in the DAG which we can eliminate.
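// For example (illustrative, not from the original source), for crc32b:
//   (int_aarch64_crc32b acc, (and data, 0xff)) --> (int_aarch64_crc32b acc, data)
// since CRC32B only reads the low 8 bits of its data operand anyway.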
20179static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
20180 SDValue AndN = N->getOperand(2);
20181 if (AndN.getOpcode() != ISD::AND)
20182 return SDValue();
20183
20184 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
20185 if (!CMask || CMask->getZExtValue() != Mask)
20186 return SDValue();
20187
20188 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
20189 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
20190}
20191
20193 SelectionDAG &DAG) {
20194 SDLoc dl(N);
20195 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
20196 DAG.getNode(Opc, dl,
20197 N->getOperand(1).getSimpleValueType(),
20198 N->getOperand(1)),
20199 DAG.getConstant(0, dl, MVT::i64));
20200}
20201
20202static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
20203 SDLoc DL(N);
20204 SDValue Op1 = N->getOperand(1);
20205 SDValue Op2 = N->getOperand(2);
20206 EVT ScalarTy = Op2.getValueType();
20207 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
20208 ScalarTy = MVT::i32;
20209
20210 // Lower index_vector(base, step) to mul(step, step_vector(1)) + splat(base).
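  // For example (illustrative, not from the original source), for an nxv4i32
  // index(2, 3) this builds add(mul(step_vector <0,1,2,3,...>, splat(3)),
  // splat(2)), i.e. the sequence <2, 5, 8, 11, ...>.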
20211 SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
20212 SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
20213 SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
20214 SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
20215 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
20216}
20217
20218static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
20219 SDLoc dl(N);
20220 SDValue Scalar = N->getOperand(3);
20221 EVT ScalarTy = Scalar.getValueType();
20222
20223 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
20224 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
20225
20226 SDValue Passthru = N->getOperand(1);
20227 SDValue Pred = N->getOperand(2);
20228 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
20229 Pred, Scalar, Passthru);
20230}
20231
20232static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
20233 SDLoc dl(N);
20234 LLVMContext &Ctx = *DAG.getContext();
20235 EVT VT = N->getValueType(0);
20236
20237 assert(VT.isScalableVector() && "Expected a scalable vector.");
20238
20239 // Current lowering only supports the SVE-ACLE types.
20241 return SDValue();
20242
20243 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
20244 unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
20245 EVT ByteVT =
20246 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
20247
20248 // Convert everything to the domain of EXT (i.e. bytes).
20249 SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
20250 SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
20251 SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
20252 DAG.getConstant(ElemSize, dl, MVT::i32));
20253
20254 SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
20255 return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
20256}
20257
20258static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
20259 TargetLowering::DAGCombinerInfo &DCI,
20260 SelectionDAG &DAG) {
20261 if (DCI.isBeforeLegalize())
20262 return SDValue();
20263
20264 SDValue Comparator = N->getOperand(3);
20265 if (Comparator.getOpcode() == AArch64ISD::DUP ||
20266 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
20267 unsigned IID = getIntrinsicID(N);
20268 EVT VT = N->getValueType(0);
20269 EVT CmpVT = N->getOperand(2).getValueType();
20270 SDValue Pred = N->getOperand(1);
20271 SDValue Imm;
20272 SDLoc DL(N);
20273
20274 switch (IID) {
20275 default:
20276 llvm_unreachable("Called with wrong intrinsic!");
20277 break;
20278
20279 // Signed comparisons
20280 case Intrinsic::aarch64_sve_cmpeq_wide:
20281 case Intrinsic::aarch64_sve_cmpne_wide:
20282 case Intrinsic::aarch64_sve_cmpge_wide:
20283 case Intrinsic::aarch64_sve_cmpgt_wide:
20284 case Intrinsic::aarch64_sve_cmplt_wide:
20285 case Intrinsic::aarch64_sve_cmple_wide: {
20286 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
20287 int64_t ImmVal = CN->getSExtValue();
20288 if (ImmVal >= -16 && ImmVal <= 15)
20289 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
20290 else
20291 return SDValue();
20292 }
20293 break;
20294 }
20295 // Unsigned comparisons
20296 case Intrinsic::aarch64_sve_cmphs_wide:
20297 case Intrinsic::aarch64_sve_cmphi_wide:
20298 case Intrinsic::aarch64_sve_cmplo_wide:
20299 case Intrinsic::aarch64_sve_cmpls_wide: {
20300 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
20301 uint64_t ImmVal = CN->getZExtValue();
20302 if (ImmVal <= 127)
20303 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
20304 else
20305 return SDValue();
20306 }
20307 break;
20308 }
20309 }
20310
20311 if (!Imm)
20312 return SDValue();
20313
20314 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
20315 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
20316 N->getOperand(2), Splat, DAG.getCondCode(CC));
20317 }
20318
20319 return SDValue();
20320}
20321
20322static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
20323 AArch64CC::CondCode Cond) {
20324 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20325
20326 SDLoc DL(Op);
20327 assert(Op.getValueType().isScalableVector() &&
20328 TLI.isTypeLegal(Op.getValueType()) &&
20329 "Expected legal scalable vector type!");
20330 assert(Op.getValueType() == Pg.getValueType() &&
20331 "Expected same type for PTEST operands");
20332
20333 // Ensure target-specific opcodes are using a legal type.
20334 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
20335 SDValue TVal = DAG.getConstant(1, DL, OutVT);
20336 SDValue FVal = DAG.getConstant(0, DL, OutVT);
20337
20338 // Ensure operands have type nxv16i1.
20339 if (Op.getValueType() != MVT::nxv16i1) {
20342 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
20343 else
20344 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
20345 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
20346 }
20347
20348 // Set condition code (CC) flags.
20349 SDValue Test = DAG.getNode(
20351 DL, MVT::Other, Pg, Op);
20352
20353 // Convert CC to integer based on requested condition.
20354 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
20355 SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
20356 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
20357 return DAG.getZExtOrTrunc(Res, DL, VT);
20358}
20359
20361 SelectionDAG &DAG) {
20362 SDLoc DL(N);
20363
20364 SDValue Pred = N->getOperand(1);
20365 SDValue VecToReduce = N->getOperand(2);
20366
20367 // NOTE: The integer reduction's result type is not always linked to the
20368 // operand's element type so we construct it from the intrinsic's result type.
20369 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
20370 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
20371
20372 // SVE reductions set the whole vector register with the first element
20373 // containing the reduction result, which we'll now extract.
20374 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20375 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
20376 Zero);
20377}
20378
20380 SelectionDAG &DAG) {
20381 SDLoc DL(N);
20382
20383 SDValue Pred = N->getOperand(1);
20384 SDValue VecToReduce = N->getOperand(2);
20385
20386 EVT ReduceVT = VecToReduce.getValueType();
20387 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
20388
20389 // SVE reductions set the whole vector register with the first element
20390 // containing the reduction result, which we'll now extract.
20391 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20392 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
20393 Zero);
20394}
20395
20397 SelectionDAG &DAG) {
20398 SDLoc DL(N);
20399
20400 SDValue Pred = N->getOperand(1);
20401 SDValue InitVal = N->getOperand(2);
20402 SDValue VecToReduce = N->getOperand(3);
20403 EVT ReduceVT = VecToReduce.getValueType();
20404
20405 // Ordered reductions use the first lane of the result vector as the
20406 // reduction's initial value.
20407 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20408 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
20409 DAG.getUNDEF(ReduceVT), InitVal, Zero);
20410
20411 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
20412
20413 // SVE reductions set the whole vector register with the first element
20414 // containing the reduction result, which we'll now extract.
20415 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
20416 Zero);
20417}
20418
20419// If a merged operation has no inactive lanes we can relax it to a predicated
20420// or unpredicated operation, which potentially allows better isel (perhaps
20421// using immediate forms) or relaxing register reuse requirements.
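// For example (illustrative, not from the original source), when the governing
// predicate is known to be all active:
//   sve.sqadd(pg, x, y) --> ISD::SADDSAT x, y   (UnpredOp == true)
//   sve.subr(pg, x, y)  --> ISD::SUB y, x       (UnpredOp and SwapOperands)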
20422static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
20423 SelectionDAG &DAG, bool UnpredOp = false,
20424 bool SwapOperands = false) {
20425 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
20426 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
20427 SDValue Pg = N->getOperand(1);
20428 SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
20429 SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
20430
20431 // ISD way to specify an all active predicate.
20432 if (isAllActivePredicate(DAG, Pg)) {
20433 if (UnpredOp)
20434 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
20435
20436 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
20437 }
20438
20439 // FUTURE: SplatVector(true)
20440 return SDValue();
20441}
20442
20445 const AArch64Subtarget *Subtarget) {
20446 SelectionDAG &DAG = DCI.DAG;
20447 unsigned IID = getIntrinsicID(N);
20448 switch (IID) {
20449 default:
20450 break;
20451 case Intrinsic::get_active_lane_mask: {
20452 SDValue Res = SDValue();
20453 EVT VT = N->getValueType(0);
20454 if (VT.isFixedLengthVector()) {
20455 // We can use the SVE whilelo instruction to lower this intrinsic by
20456 // creating the appropriate sequence of scalable vector operations and
20457 // then extracting a fixed-width subvector from the scalable vector.
20458
20459 SDLoc DL(N);
20460 SDValue ID =
20461 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
20462
20463 EVT WhileVT = EVT::getVectorVT(
20464 *DAG.getContext(), MVT::i1,
20466
20467 // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32.
20468 EVT PromVT = getPromotedVTForPredicate(WhileVT);
20469
20470 // Get the fixed-width equivalent of PromVT for extraction.
20471 EVT ExtVT =
20474
20475 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID,
20476 N->getOperand(1), N->getOperand(2));
20477 Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res);
20478 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res,
20479 DAG.getConstant(0, DL, MVT::i64));
20480 Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
20481 }
20482 return Res;
20483 }
20484 case Intrinsic::aarch64_neon_vcvtfxs2fp:
20485 case Intrinsic::aarch64_neon_vcvtfxu2fp:
20486 return tryCombineFixedPointConvert(N, DCI, DAG);
20487 case Intrinsic::aarch64_neon_saddv:
20489 case Intrinsic::aarch64_neon_uaddv:
20491 case Intrinsic::aarch64_neon_sminv:
20493 case Intrinsic::aarch64_neon_uminv:
20495 case Intrinsic::aarch64_neon_smaxv:
20497 case Intrinsic::aarch64_neon_umaxv:
20499 case Intrinsic::aarch64_neon_fmax:
20500 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
20501 N->getOperand(1), N->getOperand(2));
20502 case Intrinsic::aarch64_neon_fmin:
20503 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
20504 N->getOperand(1), N->getOperand(2));
20505 case Intrinsic::aarch64_neon_fmaxnm:
20506 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
20507 N->getOperand(1), N->getOperand(2));
20508 case Intrinsic::aarch64_neon_fminnm:
20509 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
20510 N->getOperand(1), N->getOperand(2));
20511 case Intrinsic::aarch64_neon_smull:
20512 return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
20513 N->getOperand(1), N->getOperand(2));
20514 case Intrinsic::aarch64_neon_umull:
20515 return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
20516 N->getOperand(1), N->getOperand(2));
20517 case Intrinsic::aarch64_neon_pmull:
20518 return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
20519 N->getOperand(1), N->getOperand(2));
20520 case Intrinsic::aarch64_neon_sqdmull:
20521 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
20522 case Intrinsic::aarch64_neon_sqshl:
20523 case Intrinsic::aarch64_neon_uqshl:
20524 case Intrinsic::aarch64_neon_sqshlu:
20525 case Intrinsic::aarch64_neon_srshl:
20526 case Intrinsic::aarch64_neon_urshl:
20527 case Intrinsic::aarch64_neon_sshl:
20528 case Intrinsic::aarch64_neon_ushl:
20529 return tryCombineShiftImm(IID, N, DAG);
20530 case Intrinsic::aarch64_neon_sabd:
20531 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
20532 N->getOperand(1), N->getOperand(2));
20533 case Intrinsic::aarch64_neon_uabd:
20534 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
20535 N->getOperand(1), N->getOperand(2));
20536 case Intrinsic::aarch64_crc32b:
20537 case Intrinsic::aarch64_crc32cb:
20538 return tryCombineCRC32(0xff, N, DAG);
20539 case Intrinsic::aarch64_crc32h:
20540 case Intrinsic::aarch64_crc32ch:
20541 return tryCombineCRC32(0xffff, N, DAG);
20542 case Intrinsic::aarch64_sve_saddv:
20543 // There is no i64 version of SADDV because the sign is irrelevant.
20544 if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
20546 else
20548 case Intrinsic::aarch64_sve_uaddv:
20550 case Intrinsic::aarch64_sve_smaxv:
20552 case Intrinsic::aarch64_sve_umaxv:
20554 case Intrinsic::aarch64_sve_sminv:
20556 case Intrinsic::aarch64_sve_uminv:
20558 case Intrinsic::aarch64_sve_orv:
20560 case Intrinsic::aarch64_sve_eorv:
20562 case Intrinsic::aarch64_sve_andv:
20564 case Intrinsic::aarch64_sve_index:
20565 return LowerSVEIntrinsicIndex(N, DAG);
20566 case Intrinsic::aarch64_sve_dup:
20567 return LowerSVEIntrinsicDUP(N, DAG);
20568 case Intrinsic::aarch64_sve_dup_x:
20569 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
20570 N->getOperand(1));
20571 case Intrinsic::aarch64_sve_ext:
20572 return LowerSVEIntrinsicEXT(N, DAG);
20573 case Intrinsic::aarch64_sve_mul_u:
20574 return DAG.getNode(AArch64ISD::MUL_PRED, SDLoc(N), N->getValueType(0),
20575 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20576 case Intrinsic::aarch64_sve_smulh_u:
20577 return DAG.getNode(AArch64ISD::MULHS_PRED, SDLoc(N), N->getValueType(0),
20578 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20579 case Intrinsic::aarch64_sve_umulh_u:
20580 return DAG.getNode(AArch64ISD::MULHU_PRED, SDLoc(N), N->getValueType(0),
20581 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20582 case Intrinsic::aarch64_sve_smin_u:
20583 return DAG.getNode(AArch64ISD::SMIN_PRED, SDLoc(N), N->getValueType(0),
20584 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20585 case Intrinsic::aarch64_sve_umin_u:
20586 return DAG.getNode(AArch64ISD::UMIN_PRED, SDLoc(N), N->getValueType(0),
20587 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20588 case Intrinsic::aarch64_sve_smax_u:
20589 return DAG.getNode(AArch64ISD::SMAX_PRED, SDLoc(N), N->getValueType(0),
20590 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20591 case Intrinsic::aarch64_sve_umax_u:
20592 return DAG.getNode(AArch64ISD::UMAX_PRED, SDLoc(N), N->getValueType(0),
20593 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20594 case Intrinsic::aarch64_sve_lsl_u:
20595 return DAG.getNode(AArch64ISD::SHL_PRED, SDLoc(N), N->getValueType(0),
20596 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20597 case Intrinsic::aarch64_sve_lsr_u:
20598 return DAG.getNode(AArch64ISD::SRL_PRED, SDLoc(N), N->getValueType(0),
20599 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20600 case Intrinsic::aarch64_sve_asr_u:
20601 return DAG.getNode(AArch64ISD::SRA_PRED, SDLoc(N), N->getValueType(0),
20602 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20603 case Intrinsic::aarch64_sve_fadd_u:
20604 return DAG.getNode(AArch64ISD::FADD_PRED, SDLoc(N), N->getValueType(0),
20605 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20606 case Intrinsic::aarch64_sve_fdiv_u:
20607 return DAG.getNode(AArch64ISD::FDIV_PRED, SDLoc(N), N->getValueType(0),
20608 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20609 case Intrinsic::aarch64_sve_fmax_u:
20610 return DAG.getNode(AArch64ISD::FMAX_PRED, SDLoc(N), N->getValueType(0),
20611 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20612 case Intrinsic::aarch64_sve_fmaxnm_u:
20613 return DAG.getNode(AArch64ISD::FMAXNM_PRED, SDLoc(N), N->getValueType(0),
20614 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20615 case Intrinsic::aarch64_sve_fmla_u:
20616 return DAG.getNode(AArch64ISD::FMA_PRED, SDLoc(N), N->getValueType(0),
20617 N->getOperand(1), N->getOperand(3), N->getOperand(4),
20618 N->getOperand(2));
20619 case Intrinsic::aarch64_sve_fmin_u:
20620 return DAG.getNode(AArch64ISD::FMIN_PRED, SDLoc(N), N->getValueType(0),
20621 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20622 case Intrinsic::aarch64_sve_fminnm_u:
20623 return DAG.getNode(AArch64ISD::FMINNM_PRED, SDLoc(N), N->getValueType(0),
20624 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20625 case Intrinsic::aarch64_sve_fmul_u:
20626 return DAG.getNode(AArch64ISD::FMUL_PRED, SDLoc(N), N->getValueType(0),
20627 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20628 case Intrinsic::aarch64_sve_fsub_u:
20629 return DAG.getNode(AArch64ISD::FSUB_PRED, SDLoc(N), N->getValueType(0),
20630 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20631 case Intrinsic::aarch64_sve_add_u:
20632 return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2),
20633 N->getOperand(3));
20634 case Intrinsic::aarch64_sve_sub_u:
20635 return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2),
20636 N->getOperand(3));
20637 case Intrinsic::aarch64_sve_subr:
20638 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
20639 case Intrinsic::aarch64_sve_and_u:
20640 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), N->getOperand(2),
20641 N->getOperand(3));
20642 case Intrinsic::aarch64_sve_bic_u:
20643 return DAG.getNode(AArch64ISD::BIC, SDLoc(N), N->getValueType(0),
20644 N->getOperand(2), N->getOperand(3));
20645 case Intrinsic::aarch64_sve_eor_u:
20646 return DAG.getNode(ISD::XOR, SDLoc(N), N->getValueType(0), N->getOperand(2),
20647 N->getOperand(3));
20648 case Intrinsic::aarch64_sve_orr_u:
20649 return DAG.getNode(ISD::OR, SDLoc(N), N->getValueType(0), N->getOperand(2),
20650 N->getOperand(3));
20651 case Intrinsic::aarch64_sve_sabd_u:
20652 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
20653 N->getOperand(2), N->getOperand(3));
20654 case Intrinsic::aarch64_sve_uabd_u:
20655 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
20656 N->getOperand(2), N->getOperand(3));
20657 case Intrinsic::aarch64_sve_sdiv_u:
20658 return DAG.getNode(AArch64ISD::SDIV_PRED, SDLoc(N), N->getValueType(0),
20659 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20660 case Intrinsic::aarch64_sve_udiv_u:
20661 return DAG.getNode(AArch64ISD::UDIV_PRED, SDLoc(N), N->getValueType(0),
20662 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20663 case Intrinsic::aarch64_sve_sqadd:
20664 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
20665 case Intrinsic::aarch64_sve_sqsub_u:
20666 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
20667 N->getOperand(2), N->getOperand(3));
20668 case Intrinsic::aarch64_sve_uqadd:
20669 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
20670 case Intrinsic::aarch64_sve_uqsub_u:
20671 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
20672 N->getOperand(2), N->getOperand(3));
20673 case Intrinsic::aarch64_sve_sqadd_x:
20674 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
20675 N->getOperand(1), N->getOperand(2));
20676 case Intrinsic::aarch64_sve_sqsub_x:
20677 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
20678 N->getOperand(1), N->getOperand(2));
20679 case Intrinsic::aarch64_sve_uqadd_x:
20680 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
20681 N->getOperand(1), N->getOperand(2));
20682 case Intrinsic::aarch64_sve_uqsub_x:
20683 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
20684 N->getOperand(1), N->getOperand(2));
20685 case Intrinsic::aarch64_sve_asrd:
20686 return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
20687 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20688 case Intrinsic::aarch64_sve_cmphs:
20689 if (!N->getOperand(2).getValueType().isFloatingPoint())
20691 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20692 N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
20693 break;
20694 case Intrinsic::aarch64_sve_cmphi:
20695 if (!N->getOperand(2).getValueType().isFloatingPoint())
20697 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20698 N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
20699 break;
20700 case Intrinsic::aarch64_sve_fcmpge:
20701 case Intrinsic::aarch64_sve_cmpge:
20703 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20704 N->getOperand(3), DAG.getCondCode(ISD::SETGE));
20705 break;
20706 case Intrinsic::aarch64_sve_fcmpgt:
20707 case Intrinsic::aarch64_sve_cmpgt:
20709 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20710 N->getOperand(3), DAG.getCondCode(ISD::SETGT));
20711 break;
20712 case Intrinsic::aarch64_sve_fcmpeq:
20713 case Intrinsic::aarch64_sve_cmpeq:
20715 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20716 N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
20717 break;
20718 case Intrinsic::aarch64_sve_fcmpne:
20719 case Intrinsic::aarch64_sve_cmpne:
20721 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20722 N->getOperand(3), DAG.getCondCode(ISD::SETNE));
20723 break;
20724 case Intrinsic::aarch64_sve_fcmpuo:
20726 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20727 N->getOperand(3), DAG.getCondCode(ISD::SETUO));
20728 break;
20729 case Intrinsic::aarch64_sve_fadda:
20731 case Intrinsic::aarch64_sve_faddv:
20733 case Intrinsic::aarch64_sve_fmaxnmv:
20735 case Intrinsic::aarch64_sve_fmaxv:
20737 case Intrinsic::aarch64_sve_fminnmv:
20739 case Intrinsic::aarch64_sve_fminv:
20741 case Intrinsic::aarch64_sve_sel:
20742 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
20743 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20744 case Intrinsic::aarch64_sve_cmpeq_wide:
20745 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
20746 case Intrinsic::aarch64_sve_cmpne_wide:
20747 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
20748 case Intrinsic::aarch64_sve_cmpge_wide:
20749 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
20750 case Intrinsic::aarch64_sve_cmpgt_wide:
20751 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
20752 case Intrinsic::aarch64_sve_cmplt_wide:
20753 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
20754 case Intrinsic::aarch64_sve_cmple_wide:
20755 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
20756 case Intrinsic::aarch64_sve_cmphs_wide:
20757 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
20758 case Intrinsic::aarch64_sve_cmphi_wide:
20759 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
20760 case Intrinsic::aarch64_sve_cmplo_wide:
20761 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
20762 case Intrinsic::aarch64_sve_cmpls_wide:
20763 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
20764 case Intrinsic::aarch64_sve_ptest_any:
20765 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
20767 case Intrinsic::aarch64_sve_ptest_first:
20768 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
20770 case Intrinsic::aarch64_sve_ptest_last:
20771 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
20773 }
20774 return SDValue();
20775}
20776
20777static bool isCheapToExtend(const SDValue &N) {
20778 unsigned OC = N->getOpcode();
20779 return OC == ISD::LOAD || OC == ISD::MLOAD ||
20781}
20782
20783static SDValue
20784performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20785 SelectionDAG &DAG) {
20786 // If we have (sext (setcc A B)) and A and B are cheap to extend,
20787 // we can move the sext into the arguments and have the same result. For
20788 // example, if A and B are both loads, we can make those extending loads and
20789 // avoid an extra instruction. This pattern appears often in VLS code
20790 // generation where the inputs to the setcc have a different size to the
20791 // instruction that wants to use the result of the setcc.
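  // Illustrative sketch (not from the original source):
  //   (sext (setcc (load A), (load B), cc))
  //     --> (setcc (ext (load A)), (ext (load B)), cc)
  // where ext is a sign or zero extend chosen from the signedness of cc, and
  // each ext(load) can then be folded into a single extending load.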
20792 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
20793 N->getOperand(0)->getOpcode() == ISD::SETCC);
20794 const SDValue SetCC = N->getOperand(0);
20795
20796 const SDValue CCOp0 = SetCC.getOperand(0);
20797 const SDValue CCOp1 = SetCC.getOperand(1);
20798 if (!CCOp0->getValueType(0).isInteger() ||
20799 !CCOp1->getValueType(0).isInteger())
20800 return SDValue();
20801
20802 ISD::CondCode Code =
20803 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();
20804
20805 ISD::NodeType ExtType =
20806 isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
20807
20808 if (isCheapToExtend(SetCC.getOperand(0)) &&
20809 isCheapToExtend(SetCC.getOperand(1))) {
20810 const SDValue Ext1 =
20811 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
20812 const SDValue Ext2 =
20813 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);
20814
20815 return DAG.getSetCC(
20816 SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
20817 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
20818 }
20819
20820 return SDValue();
20821}
20822
20825 SelectionDAG &DAG) {
20826 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
20827 // we can convert that DUP into another extract_high (of a bigger DUP), which
20828 // helps the backend to decide that an sabdl2 would be useful, saving a real
20829 // extract_high operation.
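  // Illustrative sketch (not from the original source):
  //   zext(abdu (extract_high V), (dup X))
  //     --> zext(abdu (extract_high V), (extract_high (dup128 X)))
  // which lets isel select uabdl2 rather than a real extract_high plus uabdl.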
20830 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
20831 (N->getOperand(0).getOpcode() == ISD::ABDU ||
20832 N->getOperand(0).getOpcode() == ISD::ABDS)) {
20833 SDNode *ABDNode = N->getOperand(0).getNode();
20834 SDValue NewABD =
20836 if (!NewABD.getNode())
20837 return SDValue();
20838
20839 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
20840 }
20841
20842 if (N->getValueType(0).isFixedLengthVector() &&
20843 N->getOpcode() == ISD::SIGN_EXTEND &&
20844 N->getOperand(0)->getOpcode() == ISD::SETCC)
20845 return performSignExtendSetCCCombine(N, DCI, DAG);
20846
20847 return SDValue();
20848}
20849
20850static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
20851 SDValue SplatVal, unsigned NumVecElts) {
20852 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
20853 Align OrigAlignment = St.getAlign();
20854 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
20855
20856 // Create scalar stores. This is at least as good as the code sequence for a
20857 // split unaligned store which is a dup.s, ext.b, and two stores.
20858 // Most of the time the three stores should be replaced by store pair
20859 // instructions (stp).
20860 SDLoc DL(&St);
20861 SDValue BasePtr = St.getBasePtr();
20862 uint64_t BaseOffset = 0;
20863
20864 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
20865 SDValue NewST1 =
20866 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
20867 OrigAlignment, St.getMemOperand()->getFlags());
20868
20869 // As this is in ISel, we will not merge this add, which may degrade results.
20870 if (BasePtr->getOpcode() == ISD::ADD &&
20871 isa<ConstantSDNode>(BasePtr->getOperand(1))) {
20872 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
20873 BasePtr = BasePtr->getOperand(0);
20874 }
20875
20876 unsigned Offset = EltOffset;
20877 while (--NumVecElts) {
20878 Align Alignment = commonAlignment(OrigAlignment, Offset);
20879 SDValue OffsetPtr =
20880 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
20881 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
20882 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
20883 PtrInfo.getWithOffset(Offset), Alignment,
20884 St.getMemOperand()->getFlags());
20885 Offset += EltOffset;
20886 }
20887 return NewST1;
20888}
20889
20890// Returns an SVE type that ContentTy can be trivially sign or zero extended
20891// into.
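// For example (illustrative, not from the original source), nxv2i16 and
// nxv2f32 both map to nxv2i64, while nxv4i8 and nxv4i16 map to nxv4i32.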
20892static MVT getSVEContainerType(EVT ContentTy) {
20893 assert(ContentTy.isSimple() && "No SVE containers for extended types");
20894
20895 switch (ContentTy.getSimpleVT().SimpleTy) {
20896 default:
20897 llvm_unreachable("No known SVE container for this MVT type");
20898 case MVT::nxv2i8:
20899 case MVT::nxv2i16:
20900 case MVT::nxv2i32:
20901 case MVT::nxv2i64:
20902 case MVT::nxv2f32:
20903 case MVT::nxv2f64:
20904 return MVT::nxv2i64;
20905 case MVT::nxv4i8:
20906 case MVT::nxv4i16:
20907 case MVT::nxv4i32:
20908 case MVT::nxv4f32:
20909 return MVT::nxv4i32;
20910 case MVT::nxv8i8:
20911 case MVT::nxv8i16:
20912 case MVT::nxv8f16:
20913 case MVT::nxv8bf16:
20914 return MVT::nxv8i16;
20915 case MVT::nxv16i8:
20916 return MVT::nxv16i8;
20917 }
20918}
20919
20920static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
20921 SDLoc DL(N);
20922 EVT VT = N->getValueType(0);
20923
20925 return SDValue();
20926
20927 EVT ContainerVT = VT;
20928 if (ContainerVT.isInteger())
20929 ContainerVT = getSVEContainerType(ContainerVT);
20930
20931 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
20932 SDValue Ops[] = { N->getOperand(0), // Chain
20933 N->getOperand(2), // Pg
20934 N->getOperand(3), // Base
20935 DAG.getValueType(VT) };
20936
20937 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
20938 SDValue LoadChain = SDValue(Load.getNode(), 1);
20939
20940 if (ContainerVT.isInteger() && (VT != ContainerVT))
20941 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
20942
20943 return DAG.getMergeValues({ Load, LoadChain }, DL);
20944}
20945
20947 SDLoc DL(N);
20948 EVT VT = N->getValueType(0);
20949 EVT PtrTy = N->getOperand(3).getValueType();
20950
20951 EVT LoadVT = VT;
20952 if (VT.isFloatingPoint())
20953 LoadVT = VT.changeTypeToInteger();
20954
20955 auto *MINode = cast<MemIntrinsicSDNode>(N);
20956 SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
20957 SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
20958 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
20959 MINode->getOperand(2), PassThru,
20960 MINode->getMemoryVT(), MINode->getMemOperand(),
20962
20963 if (VT.isFloatingPoint()) {
20964 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
20965 return DAG.getMergeValues(Ops, DL);
20966 }
20967
20968 return L;
20969}
20970
20971template <unsigned Opcode>
20973 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
20975 "Unsupported opcode.");
20976 SDLoc DL(N);
20977 EVT VT = N->getValueType(0);
20978
20979 EVT LoadVT = VT;
20980 if (VT.isFloatingPoint())
20981 LoadVT = VT.changeTypeToInteger();
20982
20983 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
20984 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
20985 SDValue LoadChain = SDValue(Load.getNode(), 1);
20986
20987 if (VT.isFloatingPoint())
20988 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
20989
20990 return DAG.getMergeValues({Load, LoadChain}, DL);
20991}
20992
20994 SDLoc DL(N);
20995 SDValue Data = N->getOperand(2);
20996 EVT DataVT = Data.getValueType();
20997 EVT HwSrcVt = getSVEContainerType(DataVT);
20998 SDValue InputVT = DAG.getValueType(DataVT);
20999
21000 if (DataVT.isFloatingPoint())
21001 InputVT = DAG.getValueType(HwSrcVt);
21002
21003 SDValue SrcNew;
21004 if (Data.getValueType().isFloatingPoint())
21005 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
21006 else
21007 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
21008
21009 SDValue Ops[] = { N->getOperand(0), // Chain
21010 SrcNew,
21011 N->getOperand(4), // Base
21012 N->getOperand(3), // Pg
21013 InputVT
21014 };
21015
21016 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
21017}
21018
21020 SDLoc DL(N);
21021
21022 SDValue Data = N->getOperand(2);
21023 EVT DataVT = Data.getValueType();
21024 EVT PtrTy = N->getOperand(4).getValueType();
21025
21026 if (DataVT.isFloatingPoint())
21028
21029 auto *MINode = cast<MemIntrinsicSDNode>(N);
21030 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
21031 DAG.getUNDEF(PtrTy), MINode->getOperand(3),
21032 MINode->getMemoryVT(), MINode->getMemOperand(),
21033 ISD::UNINDEXED, false, false);
21034}
21035
21036/// Replace a vector store of a zero splat by scalar stores of WZR/XZR. The
21037/// load store optimizer pass will merge them into store pair stores. This should
21038/// be better than a movi to create the vector zero followed by a vector store
21039/// if the zero constant is not re-used, since one instruction and one register
21040/// live range will be removed.
21041///
21042/// For example, the final generated code should be:
21043///
21044/// stp xzr, xzr, [x0]
21045///
21046/// instead of:
21047///
21048/// movi v0.2d, #0
21049/// str q0, [x0]
21050///
21051static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
21052 SDValue StVal = St.getValue();
21053 EVT VT = StVal.getValueType();
21054
21055 // Avoid scalarizing zero splat stores for scalable vectors.
21056 if (VT.isScalableVector())
21057 return SDValue();
21058
21059 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
21060 // 2, 3 or 4 i32 elements.
21061 int NumVecElts = VT.getVectorNumElements();
21062 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
21063 VT.getVectorElementType().getSizeInBits() == 64) ||
21064 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
21065 VT.getVectorElementType().getSizeInBits() == 32)))
21066 return SDValue();
21067
21068 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
21069 return SDValue();
21070
21071 // If the zero constant has more than one use then the vector store could be
21072 // better since the constant mov will be amortized and stp q instructions
21073 // should be able to be formed.
21074 if (!StVal.hasOneUse())
21075 return SDValue();
21076
21077 // If the store is truncating then it's going down to i16 or smaller, which
21078 // means it can be implemented in a single store anyway.
21079 if (St.isTruncatingStore())
21080 return SDValue();
21081
21082 // If the immediate offset of the address operand is too large for the stp
21083 // instruction, then bail out.
21084 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
21085 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
21086 if (Offset < -512 || Offset > 504)
21087 return SDValue();
21088 }
21089
21090 for (int I = 0; I < NumVecElts; ++I) {
21091 SDValue EltVal = StVal.getOperand(I);
21092 if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
21093 return SDValue();
21094 }
21095
21096 // Use a CopyFromReg WZR/XZR here to prevent
21097 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
21098 SDLoc DL(&St);
21099 unsigned ZeroReg;
21100 EVT ZeroVT;
21101 if (VT.getVectorElementType().getSizeInBits() == 32) {
21102 ZeroReg = AArch64::WZR;
21103 ZeroVT = MVT::i32;
21104 } else {
21105 ZeroReg = AArch64::XZR;
21106 ZeroVT = MVT::i64;
21107 }
21108 SDValue SplatVal =
21109 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
21110 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
21111}
21112
21113/// Replace a vector store of a scalar splat by scalar stores of the scalar
21114/// value. The load store optimizer pass will merge them into store pair stores.
21115/// This has better performance than a splat of the scalar followed by a split
21116/// vector store. Even if the stores are not merged it is four stores vs a dup,
21117/// followed by an ext.b and two stores.
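/// For example (illustrative, not from the original source), a v4i32 store of
/// a splat of w1 to [x0] can become four scalar stores of w1 at offsets 0, 4,
/// 8 and 12, which the load/store optimizer can then merge into two stp
/// instructions.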
21118static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
21119 SDValue StVal = St.getValue();
21120 EVT VT = StVal.getValueType();
21121
21122 // Don't replace floating point stores, they possibly won't be transformed to
21123 // stp because of the store pair suppress pass.
21124 if (VT.isFloatingPoint())
21125 return SDValue();
21126
21127 // We can express a splat as store pair(s) for 2 or 4 elements.
21128 unsigned NumVecElts = VT.getVectorNumElements();
21129 if (NumVecElts != 4 && NumVecElts != 2)
21130 return SDValue();
21131
21132 // If the store is truncating then it's going down to i16 or smaller, which
21133 // means it can be implemented in a single store anyway.
21134 if (St.isTruncatingStore())
21135 return SDValue();
21136
21137 // Check that this is a splat.
21138 // Make sure that each of the relevant vector element locations are inserted
21139 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
21140 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
21141 SDValue SplatVal;
21142 for (unsigned I = 0; I < NumVecElts; ++I) {
21143 // Check for insert vector elements.
21144 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
21145 return SDValue();
21146
21147 // Check that same value is inserted at each vector element.
21148 if (I == 0)
21149 SplatVal = StVal.getOperand(1);
21150 else if (StVal.getOperand(1) != SplatVal)
21151 return SDValue();
21152
21153 // Check insert element index.
21154 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
21155 if (!CIndex)
21156 return SDValue();
21157 uint64_t IndexVal = CIndex->getZExtValue();
21158 if (IndexVal >= NumVecElts)
21159 return SDValue();
21160 IndexNotInserted.reset(IndexVal);
21161
21162 StVal = StVal.getOperand(0);
21163 }
21164 // Check that all vector element locations were inserted to.
21165 if (IndexNotInserted.any())
21166 return SDValue();
21167
21168 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
21169}
21170
21172 SelectionDAG &DAG,
21173 const AArch64Subtarget *Subtarget) {
21174
21175 StoreSDNode *S = cast<StoreSDNode>(N);
21176 if (S->isVolatile() || S->isIndexed())
21177 return SDValue();
21178
21179 SDValue StVal = S->getValue();
21180 EVT VT = StVal.getValueType();
21181
21182 if (!VT.isFixedLengthVector())
21183 return SDValue();
21184
21185 // If we get a splat of zeros, convert this vector store to a store of
21186 // scalars. They will be merged into store pairs of xzr thereby removing one
21187 // instruction and one register.
21188 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
21189 return ReplacedZeroSplat;
21190
21191 // FIXME: The logic for deciding if an unaligned store should be split should
21192 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
21193 // a call to that function here.
21194
21195 if (!Subtarget->isMisaligned128StoreSlow())
21196 return SDValue();
21197
21198 // Don't split at -Oz.
21200 return SDValue();
21201
21202 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
21203 // those up regresses performance on micro-benchmarks and olden/bh.
21204 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
21205 return SDValue();
21206
21207 // Split unaligned 16B stores. They are terrible for performance.
21208 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
21209 // extensions can use this to mark that it does not want splitting to happen
21210 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
21211 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
21212 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
21213 S->getAlign() <= Align(2))
21214 return SDValue();
21215
21216 // If we get a splat of a scalar convert this vector store to a store of
21217 // scalars. They will be merged into store pairs thereby removing two
21218 // instructions.
21219 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
21220 return ReplacedSplat;
21221
21222 SDLoc DL(S);
21223
21224 // Split VT into two.
21225 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
21226 unsigned NumElts = HalfVT.getVectorNumElements();
21227 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
21228 DAG.getConstant(0, DL, MVT::i64));
21229 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
21230 DAG.getConstant(NumElts, DL, MVT::i64));
21231 SDValue BasePtr = S->getBasePtr();
21232 SDValue NewST1 =
21233 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
21234 S->getAlign(), S->getMemOperand()->getFlags());
21235 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
21236 DAG.getConstant(8, DL, MVT::i64));
21237 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
21238 S->getPointerInfo(), S->getAlign(),
21239 S->getMemOperand()->getFlags());
21240}
21241
21243 assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexepected Opcode!");
21244
21245 // splice(pg, op1, undef) -> op1
21246 if (N->getOperand(2).isUndef())
21247 return N->getOperand(1);
21248
21249 return SDValue();
21250}
21251
21253 const AArch64Subtarget *Subtarget) {
21254 assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
21255 N->getOpcode() == AArch64ISD::UUNPKLO) &&
21256 "Unexpected Opcode!");
21257
21258 // uunpklo/hi undef -> undef
21259 if (N->getOperand(0).isUndef())
21260 return DAG.getUNDEF(N->getValueType(0));
21261
21262 // If this is a masked load followed by an UUNPKLO, fold this into a masked
21263 // extending load. We can do this even if this is already a masked
21264 // {z,}extload.
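    // Illustrative sketch (not from the original source):
    //   uunpklo(nxv16i8 masked_load(ptr, ptrue(vl16), passthru=zero))
    //     --> nxv8i16 zero-extending masked_load(ptr, ptrue(vl16), zero)
    // provided the doubled predicate pattern still fits within the SVE vector.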
21265 if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
21266 N->getOpcode() == AArch64ISD::UUNPKLO) {
21267 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
21268 SDValue Mask = MLD->getMask();
21269 SDLoc DL(N);
21270
21271 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
21272 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
21273 (MLD->getPassThru()->isUndef() ||
21274 isZerosVector(MLD->getPassThru().getNode()))) {
21275 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
21276 unsigned PgPattern = Mask->getConstantOperandVal(0);
21277 EVT VT = N->getValueType(0);
21278
21279 // Ensure we can double the size of the predicate pattern
21280 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
21281 if (NumElts &&
21282 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
21283 Mask =
21284 getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
21285 SDValue PassThru = DAG.getConstant(0, DL, VT);
21286 SDValue NewLoad = DAG.getMaskedLoad(
21287 VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
21288 PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
21290              MLD->getAddressingMode(), ISD::ZEXTLOAD);
21291 DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));
21292
21293 return NewLoad;
21294 }
21295 }
21296 }
21297
21298 return SDValue();
21299}
21300
21302 if (N->getOpcode() != AArch64ISD::UZP1)
21303 return false;
21304 SDValue Op0 = N->getOperand(0);
21305 EVT SrcVT = Op0->getValueType(0);
21306 EVT DstVT = N->getValueType(0);
21307 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv16i8) ||
21308 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv8i16) ||
21309 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv4i32);
21310}
21311
21312// Try to combine rounding shifts where the operands come from an extend, and
21313// the result is truncated and combined into one vector.
21314// uzp1(rshrnb(uunpklo(X),C), rshrnb(uunpkhi(X), C)) -> urshr(X, C)
21316 assert(N->getOpcode() == AArch64ISD::UZP1 && "Only UZP1 expected.");
21317 SDValue Op0 = N->getOperand(0);
21318 SDValue Op1 = N->getOperand(1);
21319 EVT ResVT = N->getValueType(0);
21320
21321 unsigned RshOpc = Op0.getOpcode();
21322 if (RshOpc != AArch64ISD::RSHRNB_I)
21323 return SDValue();
21324
21325 // Same op code and imm value?
21326 SDValue ShiftValue = Op0.getOperand(1);
21327 if (RshOpc != Op1.getOpcode() || ShiftValue != Op1.getOperand(1))
21328 return SDValue();
21329
21330 // Same unextended operand value?
21331 SDValue Lo = Op0.getOperand(0);
21332 SDValue Hi = Op1.getOperand(0);
21333 if (Lo.getOpcode() != AArch64ISD::UUNPKLO &&
21334 Hi.getOpcode() != AArch64ISD::UUNPKHI)
21335 return SDValue();
21336 SDValue OrigArg = Lo.getOperand(0);
21337 if (OrigArg != Hi.getOperand(0))
21338 return SDValue();
21339
21340 SDLoc DL(N);
21341 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, ResVT,
21342 getPredicateForVector(DAG, DL, ResVT), OrigArg,
21343 ShiftValue);
21344}
21345
21346// Try to simplify:
21347// t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
21348// t2 = nxv8i16 srl(t1, ShiftValue)
21349// to
21350// t2 = nxv8i16 rshrnb(X, ShiftValue).
21351// rshrnb will zero the top half bits of each element. Therefore, this combine
21352// should only be performed when a following instruction that uses the rshrnb
21353// result does not care about the top half of each element, for example a uzp1
21354// or a truncating store.
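// For example, with ShiftValue == 4 the pattern
//   t1 = nxv8i16 add(X, 8)      ; 8 == 1 << (4 - 1), the rounding bias
//   t2 = nxv8i16 srl(t1, 4)
// becomes a single rshrnb(X, 4), which is valid here because the consumer
// (a uzp1 or truncating store) ignores the top half of each element.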
21356 const AArch64Subtarget *Subtarget) {
21357 EVT VT = Srl->getValueType(0);
21358 if (!VT.isScalableVector() || !Subtarget->hasSVE2())
21359 return SDValue();
21360
21361 EVT ResVT;
21362 if (VT == MVT::nxv8i16)
21363 ResVT = MVT::nxv16i8;
21364 else if (VT == MVT::nxv4i32)
21365 ResVT = MVT::nxv8i16;
21366 else if (VT == MVT::nxv2i64)
21367 ResVT = MVT::nxv4i32;
21368 else
21369 return SDValue();
21370
21371 SDLoc DL(Srl);
21372 unsigned ShiftValue;
21373 SDValue RShOperand;
21374 if (!canLowerSRLToRoundingShiftForVT(Srl, ResVT, DAG, ShiftValue, RShOperand))
21375 return SDValue();
21376 SDValue Rshrnb = DAG.getNode(
21377 AArch64ISD::RSHRNB_I, DL, ResVT,
21378 {RShOperand, DAG.getTargetConstant(ShiftValue, DL, MVT::i32)});
21379 return DAG.getNode(ISD::BITCAST, DL, VT, Rshrnb);
21380}
21381
21383 const AArch64Subtarget *Subtarget) {
21384 SDLoc DL(N);
21385 SDValue Op0 = N->getOperand(0);
21386 SDValue Op1 = N->getOperand(1);
21387 EVT ResVT = N->getValueType(0);
21388
21389 // uzp1(x, undef) -> concat(truncate(x), undef)
21390 if (Op1.getOpcode() == ISD::UNDEF) {
21391 EVT BCVT = MVT::Other, HalfVT = MVT::Other;
21392 switch (ResVT.getSimpleVT().SimpleTy) {
21393 default:
21394 break;
21395 case MVT::v16i8:
21396 BCVT = MVT::v8i16;
21397 HalfVT = MVT::v8i8;
21398 break;
21399 case MVT::v8i16:
21400 BCVT = MVT::v4i32;
21401 HalfVT = MVT::v4i16;
21402 break;
21403 case MVT::v4i32:
21404 BCVT = MVT::v2i64;
21405 HalfVT = MVT::v2i32;
21406 break;
21407 }
21408 if (BCVT != MVT::Other) {
21409 SDValue BC = DAG.getBitcast(BCVT, Op0);
21410 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
21411 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
21412 DAG.getUNDEF(HalfVT));
21413 }
21414 }
21415
21416 if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
21417 return Urshr;
21418
21419 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op0, DAG, Subtarget))
21420 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1);
21421
21422 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op1, DAG, Subtarget))
21423 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb);
21424
21425 // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
21426 if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
21427 if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
21428 SDValue X = Op0.getOperand(0).getOperand(0);
21429 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
21430 }
21431 }
21432
21433 // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z)
21434 if (Op1.getOpcode() == AArch64ISD::UUNPKHI) {
21435 if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
21436 SDValue Z = Op1.getOperand(0).getOperand(1);
21437 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
21438 }
21439 }
21440
21441 // These optimizations only work on little endian.
21442 if (!DAG.getDataLayout().isLittleEndian())
21443 return SDValue();
21444
21445 // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
21446 // Example:
21447 // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
21448 // to
21449 // nxv4i32 = uzp1 nxv4i32 x, nxv4i32 y
21451 Op0.getOpcode() == ISD::BITCAST && Op1.getOpcode() == ISD::BITCAST) {
21452 if (Op0.getOperand(0).getValueType() == Op1.getOperand(0).getValueType()) {
21453 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0.getOperand(0),
21454 Op1.getOperand(0));
21455 }
21456 }
21457
21458 if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
21459 return SDValue();
21460
21461 SDValue SourceOp0 = peekThroughBitcasts(Op0);
21462 SDValue SourceOp1 = peekThroughBitcasts(Op1);
21463
21464 // truncating uzp1(x, y) -> xtn(concat (x, y))
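  // For example:
  //   v4i16 uzp1(bitcast(v2i32 x), bitcast(v2i32 y))
  // becomes
  //   v4i16 trunc(v4i32 concat(x, y))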
21465 if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
21466 EVT Op0Ty = SourceOp0.getValueType();
21467 if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
21468 (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
21469 SDValue Concat =
21472 SourceOp0, SourceOp1);
21473 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Concat);
21474 }
21475 }
21476
21477 // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
21478 if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
21479 SourceOp1.getOpcode() != ISD::TRUNCATE)
21480 return SDValue();
21481 SourceOp0 = SourceOp0.getOperand(0);
21482 SourceOp1 = SourceOp1.getOperand(0);
21483
21484 if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
21485 !SourceOp0.getValueType().isSimple())
21486 return SDValue();
21487
21488 EVT ResultTy;
21489
21490 switch (SourceOp0.getSimpleValueType().SimpleTy) {
21491 case MVT::v2i64:
21492 ResultTy = MVT::v4i32;
21493 break;
21494 case MVT::v4i32:
21495 ResultTy = MVT::v8i16;
21496 break;
21497 case MVT::v8i16:
21498 ResultTy = MVT::v16i8;
21499 break;
21500 default:
21501 return SDValue();
21502 }
21503
21504 SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
21505 SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
21506 SDValue UzpResult =
21507 DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
21508
21509 EVT BitcastResultTy;
21510
21511 switch (ResVT.getSimpleVT().SimpleTy) {
21512 case MVT::v2i32:
21513 BitcastResultTy = MVT::v2i64;
21514 break;
21515 case MVT::v4i16:
21516 BitcastResultTy = MVT::v4i32;
21517 break;
21518 case MVT::v8i8:
21519 BitcastResultTy = MVT::v8i16;
21520 break;
21521 default:
21522 llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
21523 }
21524
21525 return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
21526 DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
21527}
21528
21530 unsigned Opc = N->getOpcode();
21531
21532 assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
21534 (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
21536 "Invalid opcode.");
21537
21538 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
21540 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
21542 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
21546
21547 SDLoc DL(N);
21548 SDValue Chain = N->getOperand(0);
21549 SDValue Pg = N->getOperand(1);
21550 SDValue Base = N->getOperand(2);
21551 SDValue Offset = N->getOperand(3);
21552 SDValue Ty = N->getOperand(4);
21553
21554 EVT ResVT = N->getValueType(0);
21555
21556 const auto OffsetOpc = Offset.getOpcode();
21557 const bool OffsetIsZExt =
21559 const bool OffsetIsSExt =
21561
21562 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
21563 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
21564 SDValue ExtPg = Offset.getOperand(0);
21565 VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
21566 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
21567
21568 // If the predicate for the sign- or zero-extended offset is the
21569 // same as the predicate used for this load and the sign-/zero-extension
21570 // was from 32 bits...
21571 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
21572 SDValue UnextendedOffset = Offset.getOperand(1);
21573
21574 unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
21575 if (Signed)
21576 NewOpc = getSignExtendedGatherOpcode(NewOpc);
21577
21578 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
21579 {Chain, Pg, Base, UnextendedOffset, Ty});
21580 }
21581 }
21582
21583 return SDValue();
21584}
21585
21586/// Optimize a vector shift instruction and its operand if the shifted-out
21587/// bits are not used.
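///
/// For example, vashr(vshl(x, 8), 8) can be replaced by x when every lane of x
/// already has more than 8 sign bits, and in general only the bits of the
/// shift operand that are not shifted out are demanded.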
21589 const AArch64TargetLowering &TLI,
21591 assert(N->getOpcode() == AArch64ISD::VASHR ||
21592 N->getOpcode() == AArch64ISD::VLSHR);
21593
21594 SDValue Op = N->getOperand(0);
21595 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
21596
21597 unsigned ShiftImm = N->getConstantOperandVal(1);
21598 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
21599
21600 // Remove sign_extend_inreg, i.e. ashr(shl(x)), based on the number of sign bits.
21601 if (N->getOpcode() == AArch64ISD::VASHR &&
21602 Op.getOpcode() == AArch64ISD::VSHL &&
21603 N->getOperand(1) == Op.getOperand(1))
21604 if (DCI.DAG.ComputeNumSignBits(Op.getOperand(0)) > ShiftImm)
21605 return Op.getOperand(0);
21606
21607 APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
21608 APInt DemandedMask = ~ShiftedOutBits;
21609
21610 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
21611 return SDValue(N, 0);
21612
21613 return SDValue();
21614}
21615
21617 // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
21618 // This transform works in partnership with performSetCCPunpkCombine to
21619 // remove unnecessary transfer of predicates into standard registers and back
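  // For example:
  //   nxv2i64 sunpklo(nxv4i32 sign_extend(nxv4i1 p))
  // becomes
  //   nxv2i64 sign_extend(nxv2i1 extract_subvector(p, 0))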
21620 if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
21621 N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
21622 MVT::i1) {
21623 SDValue CC = N->getOperand(0)->getOperand(0);
21624 auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
21625 SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
21626 DAG.getVectorIdxConstant(0, SDLoc(N)));
21627 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
21628 }
21629
21630 return SDValue();
21631}
21632
21633/// Target-specific DAG combine function for post-increment LD1 (lane) and
21634/// post-increment LD1R.
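///
/// For example, ld1r { v0.4s }, [x0] followed by add x0, x0, #4 can be merged
/// into the post-incremented form ld1r { v0.4s }, [x0], #4, provided the
/// increment matches the element size.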
21637 bool IsLaneOp) {
21638 if (DCI.isBeforeLegalizeOps())
21639 return SDValue();
21640
21641 SelectionDAG &DAG = DCI.DAG;
21642 EVT VT = N->getValueType(0);
21643
21644 if (!VT.is128BitVector() && !VT.is64BitVector())
21645 return SDValue();
21646
21647 unsigned LoadIdx = IsLaneOp ? 1 : 0;
21648 SDNode *LD = N->getOperand(LoadIdx).getNode();
21649 // If it is not a LOAD, we cannot do this combine.
21650 if (LD->getOpcode() != ISD::LOAD)
21651 return SDValue();
21652
21653 // The vector lane must be a constant in the LD1LANE opcode.
21654 SDValue Lane;
21655 if (IsLaneOp) {
21656 Lane = N->getOperand(2);
21657 auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
21658 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
21659 return SDValue();
21660 }
21661
21662 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
21663 EVT MemVT = LoadSDN->getMemoryVT();
21664 // Check if memory operand is the same type as the vector element.
21665 if (MemVT != VT.getVectorElementType())
21666 return SDValue();
21667
21668 // Check if there are other uses. If so, do not combine as it will introduce
21669 // an extra load.
21670 for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
21671 ++UI) {
21672 if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
21673 continue;
21674 if (*UI != N)
21675 return SDValue();
21676 }
21677
21678 // If there is one use and it can splat the value, prefer that operation.
21679 // TODO: This could be expanded to more operations if they reliably use the
21680 // index variants.
21681 if (N->hasOneUse()) {
21682 unsigned UseOpc = N->use_begin()->getOpcode();
21683 if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
21684 return SDValue();
21685 }
21686
21687 SDValue Addr = LD->getOperand(1);
21688 SDValue Vector = N->getOperand(0);
21689 // Search for a use of the address operand that is an increment.
21690 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
21691 Addr.getNode()->use_end(); UI != UE; ++UI) {
21692 SDNode *User = *UI;
21693 if (User->getOpcode() != ISD::ADD
21694 || UI.getUse().getResNo() != Addr.getResNo())
21695 continue;
21696
21697 // If the increment is a constant, it must match the memory ref size.
21698 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
21699 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
21700 uint32_t IncVal = CInc->getZExtValue();
21701 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
21702 if (IncVal != NumBytes)
21703 continue;
21704 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
21705 }
21706
21707 // To avoid creating a cycle, make sure that neither the load nor the add
21708 // is a predecessor of the other or of the Vector.
21711 Visited.insert(Addr.getNode());
21712 Worklist.push_back(User);
21713 Worklist.push_back(LD);
21714 Worklist.push_back(Vector.getNode());
21715 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
21716 SDNode::hasPredecessorHelper(User, Visited, Worklist))
21717 continue;
21718
21720 Ops.push_back(LD->getOperand(0)); // Chain
21721 if (IsLaneOp) {
21722 Ops.push_back(Vector); // The vector to be inserted
21723 Ops.push_back(Lane); // The lane to be inserted in the vector
21724 }
21725 Ops.push_back(Addr);
21726 Ops.push_back(Inc);
21727
21728 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
21729 SDVTList SDTys = DAG.getVTList(Tys);
21730 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
21731 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
21732 MemVT,
21733 LoadSDN->getMemOperand());
21734
21735 // Update the uses.
21736 SDValue NewResults[] = {
21737 SDValue(LD, 0), // The result of load
21738 SDValue(UpdN.getNode(), 2) // Chain
21739 };
21740 DCI.CombineTo(LD, NewResults);
21741 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
21742 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
21743
21744 break;
21745 }
21746 return SDValue();
21747}
21748
21749/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
21750/// address translation.
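///
/// For example, an (and X, 0x00ffffffffffffff) that is only used as an address
/// can be simplified to X, because bits 63:56 do not take part in translation
/// when TBI is enabled.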
21753 SelectionDAG &DAG) {
21754 APInt DemandedMask = APInt::getLowBitsSet(64, 56);
21755 KnownBits Known;
21757 !DCI.isBeforeLegalizeOps());
21758 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21759 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
21760 DCI.CommitTargetLoweringOpt(TLO);
21761 return true;
21762 }
21763 return false;
21764}
21765
21767 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
21768 "Expected STORE dag node in input!");
21769
21770 if (auto Store = dyn_cast<StoreSDNode>(N)) {
21771 if (!Store->isTruncatingStore() || Store->isIndexed())
21772 return SDValue();
21773 SDValue Ext = Store->getValue();
21774 auto ExtOpCode = Ext.getOpcode();
21775 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
21776 ExtOpCode != ISD::ANY_EXTEND)
21777 return SDValue();
21778 SDValue Orig = Ext->getOperand(0);
21779 if (Store->getMemoryVT() != Orig.getValueType())
21780 return SDValue();
21781 return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
21782 Store->getBasePtr(), Store->getMemOperand());
21783 }
21784
21785 return SDValue();
21786}
21787
21788// A custom combine to lower load <3 x i8> as the more efficient sequence
21789// below:
21790// ldrb wX, [x0, #2]
21791// ldrh wY, [x0]
21792// orr wX, wY, wX, lsl #16
21793// fmov s0, wX
21794//
21795// Note that an alternative sequence with even fewer (although usually more
21796// complex/expensive) instructions would be:
21797// ld1r.4h { v0 }, [x0], #2
21798// ld1.b { v0 }[2], [x0]
21799//
21800// Generating this sequence unfortunately results in noticeably worse codegen
21801// for code that extends the loaded v3i8, due to legalization breaking vector
21802// shuffle detection in a way that is very difficult to work around.
21803// TODO: Revisit once v3i8 legalization has been improved in general.
21805 EVT MemVT = LD->getMemoryVT();
21806 if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
21807 LD->getOriginalAlign() >= 4)
21808 return SDValue();
21809
21810 SDLoc DL(LD);
21812 SDValue Chain = LD->getChain();
21813 SDValue BasePtr = LD->getBasePtr();
21814 MachineMemOperand *MMO = LD->getMemOperand();
21815 assert(LD->getOffset().isUndef() && "undef offset expected");
21816
21817 // Load 2 x i8, then 1 x i8.
21818 SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
21819 TypeSize Offset2 = TypeSize::getFixed(2);
21820 SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
21821 DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
21822 MF.getMachineMemOperand(MMO, 2, 1));
21823
21824 // Extend to i32.
21825 SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
21826 SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
21827
21828 // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
21829 SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
21830 DAG.getConstant(16, DL, MVT::i32));
21831 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl);
21832 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
21833
21834 // Extract v3i8 again.
21835 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
21836 DAG.getConstant(0, DL, MVT::i64));
21837 SDValue TokenFactor = DAG.getNode(
21838 ISD::TokenFactor, DL, MVT::Other,
21839 {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
21840 return DAG.getMergeValues({Extract, TokenFactor}, DL);
21841}
21842
21843// Perform TBI simplification if supported by the target and try to break up
21844// non-temporal loads larger than 256 bits for odd types so that LDNPQ 256-bit
21845// load instructions can be selected.
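//
// For example, a 384-bit non-temporal v24i16 load is broken into a 256-bit
// v16i16 load at offset 0 and a 128-bit v8i16 load at offset 32; the results
// are concatenated and the original type is extracted from the concatenation.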
21848 SelectionDAG &DAG,
21849 const AArch64Subtarget *Subtarget) {
21850 if (Subtarget->supportsAddressTopByteIgnored())
21851 performTBISimplification(N->getOperand(1), DCI, DAG);
21852
21853 LoadSDNode *LD = cast<LoadSDNode>(N);
21854 if (LD->isVolatile() || !Subtarget->isLittleEndian())
21855 return SDValue(N, 0);
21856
21857 if (SDValue Res = combineV3I8LoadExt(LD, DAG))
21858 return Res;
21859
21860 if (!LD->isNonTemporal())
21861 return SDValue(N, 0);
21862
21863 EVT MemVT = LD->getMemoryVT();
21864 if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
21865 MemVT.getSizeInBits() % 256 == 0 ||
21866 256 % MemVT.getScalarSizeInBits() != 0)
21867 return SDValue(N, 0);
21868
21869 SDLoc DL(LD);
21870 SDValue Chain = LD->getChain();
21871 SDValue BasePtr = LD->getBasePtr();
21872 SDNodeFlags Flags = LD->getFlags();
21874 SmallVector<SDValue, 4> LoadOpsChain;
21875 // Replace any non-temporal load over 256 bits with a series of 256-bit loads
21876 // and a final scalar or vector load of fewer than 256 bits. This way we can
21877 // utilize 256-bit loads and reduce the number of load instructions generated.
21878 MVT NewVT =
21880 256 / MemVT.getVectorElementType().getSizeInBits());
21881 unsigned Num256Loads = MemVT.getSizeInBits() / 256;
21882 // Create all 256-bit loads, starting from offset 0 up to (Num256Loads - 1) * 32.
21883 for (unsigned I = 0; I < Num256Loads; I++) {
21884 unsigned PtrOffset = I * 32;
21885 SDValue NewPtr = DAG.getMemBasePlusOffset(
21886 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
21887 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
21888 SDValue NewLoad = DAG.getLoad(
21889 NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
21890 NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
21891 LoadOps.push_back(NewLoad);
21892 LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
21893 }
21894
21895 // Process the remaining bits of the load operation.
21896 // This is done by creating an UNDEF vector to match the size of the
21897 // 256-bit loads and inserting the remaining load into it. We extract the
21898 // original load type at the end using an EXTRACT_SUBVECTOR instruction.
21899 unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
21900 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
21901 MVT RemainingVT = MVT::getVectorVT(
21903 BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
21904 SDValue NewPtr = DAG.getMemBasePlusOffset(
21905 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
21906 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
21907 SDValue RemainingLoad =
21908 DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
21909 LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
21910 LD->getMemOperand()->getFlags(), LD->getAAInfo());
21911 SDValue UndefVector = DAG.getUNDEF(NewVT);
21912 SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
21913 SDValue ExtendedRemainingLoad =
21914 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
21915 {UndefVector, RemainingLoad, InsertIdx});
21916 LoadOps.push_back(ExtendedRemainingLoad);
21917 LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
21918 EVT ConcatVT =
21920 LoadOps.size() * NewVT.getVectorNumElements());
21921 SDValue ConcatVectors =
21922 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
21923 // Extract the original vector type size.
21924 SDValue ExtractSubVector =
21925 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
21926 {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
21927 SDValue TokenFactor =
21928 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
21929 return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
21930}
21931
21933 EVT VecVT = Op.getValueType();
21934 assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
21935 "Need boolean vector type.");
21936
21937 if (Depth > 3)
21939
21940 // We can get the base type from a vector compare or truncate.
21941 if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
21942 return Op.getOperand(0).getValueType();
21943
21944 // If an operand is a bool vector, continue looking.
21946 for (SDValue Operand : Op->op_values()) {
21947 if (Operand.getValueType() != VecVT)
21948 continue;
21949
21950 EVT OperandVT = tryGetOriginalBoolVectorType(Operand, Depth + 1);
21951 if (!BaseVT.isSimple())
21952 BaseVT = OperandVT;
21953 else if (OperandVT != BaseVT)
21955 }
21956
21957 return BaseVT;
21958}
21959
21960// When converting a <N x iX> vector to <N x i1> to store or use as a scalar
21961// iN, we can use a trick that extracts the i^th bit from the i^th element and
21962// then performs a vector add to get a scalar bitmask. This requires that each
21963// element's bits are either all 1 or all 0.
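//
// For example, for a v4i32 comparison result the lanes are ANDed with the mask
// <1, 2, 4, 8> and a VECREDUCE_ADD of that value yields a scalar whose low 4
// bits form the per-lane bitmask.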
21965 SDLoc DL(N);
21966 SDValue ComparisonResult(N, 0);
21967 EVT VecVT = ComparisonResult.getValueType();
21968 assert(VecVT.isVector() && "Must be a vector type");
21969
21970 unsigned NumElts = VecVT.getVectorNumElements();
21971 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
21972 return SDValue();
21973
21974 if (VecVT.getVectorElementType() != MVT::i1 &&
21975 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
21976 return SDValue();
21977
21978 // If we can find the original types to work on instead of a vector of i1,
21979 // we can avoid extend/extract conversion instructions.
21980 if (VecVT.getVectorElementType() == MVT::i1) {
21981 VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
21982 if (!VecVT.isSimple()) {
21983 unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector
21984 VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
21985 }
21986 }
21987 VecVT = VecVT.changeVectorElementTypeToInteger();
21988
21989 // Large vectors don't map directly to this conversion, so to avoid too many
21990 // edge cases, we don't apply it here. The conversion will likely still be
21991 // applied later via multiple smaller vectors, whose results are concatenated.
21992 if (VecVT.getSizeInBits() > 128)
21993 return SDValue();
21994
21995 // Ensure that all elements' bits are either 0s or 1s.
21996 ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
21997
21998 SmallVector<SDValue, 16> MaskConstants;
21999 if (VecVT == MVT::v16i8) {
22000 // v16i8 is a special case, as we have 16 entries but only 8 positional bits
22001 // per entry. We split it into two halves, apply the mask, zip the halves to
22002 // create 8x 16-bit values, and then perform the vector reduce.
22003 for (unsigned Half = 0; Half < 2; ++Half) {
22004 for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) {
22005 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
22006 }
22007 }
22008 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
22009 SDValue RepresentativeBits =
22010 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
22011
22012 SDValue UpperRepresentativeBits =
22013 DAG.getNode(AArch64ISD::EXT, DL, VecVT, RepresentativeBits,
22014 RepresentativeBits, DAG.getConstant(8, DL, MVT::i32));
22015 SDValue Zipped = DAG.getNode(AArch64ISD::ZIP1, DL, VecVT,
22016 RepresentativeBits, UpperRepresentativeBits);
22017 Zipped = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Zipped);
22018 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, Zipped);
22019 }
22020
22021 // All other vector sizes.
22022 unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1);
22023 for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) {
22024 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
22025 }
22026
22027 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
22028 SDValue RepresentativeBits =
22029 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
22030 EVT ResultVT = MVT::getIntegerVT(std::max<unsigned>(
22031 NumElts, VecVT.getVectorElementType().getSizeInBits()));
22032 return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits);
22033}
22034
22036 StoreSDNode *Store) {
22037 if (!Store->isTruncatingStore())
22038 return SDValue();
22039
22040 SDLoc DL(Store);
22041 SDValue VecOp = Store->getValue();
22042 EVT VT = VecOp.getValueType();
22043 EVT MemVT = Store->getMemoryVT();
22044
22045 if (!MemVT.isVector() || !VT.isVector() ||
22046 MemVT.getVectorElementType() != MVT::i1)
22047 return SDValue();
22048
22049 // If we are storing a vector that we are currently building, let
22050 // `scalarizeVectorStore()` handle this more efficiently.
22051 if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
22052 return SDValue();
22053
22054 VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp);
22055 SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG);
22056 if (!VectorBits)
22057 return SDValue();
22058
22059 EVT StoreVT =
22061 SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT);
22062 return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(),
22063 Store->getMemOperand());
22064}
22065
22067 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
22068 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
22069 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
22070}
22071
22072// Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
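//
// For example, on a little-endian target, store (trunc v3i32 X to v3i8) to ptr
// becomes three byte stores: the low byte of lane 0 to ptr, of lane 1 to
// ptr+1, and of lane 2 to ptr+2.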
22074 const AArch64Subtarget *Subtarget) {
22075 SDValue Value = ST->getValue();
22076 EVT ValueVT = Value.getValueType();
22077
22078 if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
22079 Value.getOpcode() != ISD::TRUNCATE ||
22080 ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
22081 return SDValue();
22082
22083 assert(ST->getOffset().isUndef() && "undef offset expected");
22084 SDLoc DL(ST);
22085 auto WideVT = EVT::getVectorVT(
22086 *DAG.getContext(),
22087 Value->getOperand(0).getValueType().getVectorElementType(), 4);
22088 SDValue UndefVector = DAG.getUNDEF(WideVT);
22089 SDValue WideTrunc = DAG.getNode(
22090 ISD::INSERT_SUBVECTOR, DL, WideVT,
22091 {UndefVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)});
22092 SDValue Cast = DAG.getNode(
22093 ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
22094 WideTrunc);
22095
22097 SDValue Chain = ST->getChain();
22098 MachineMemOperand *MMO = ST->getMemOperand();
22099 unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
22100 SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22101 DAG.getConstant(2 * IdxScale, DL, MVT::i64));
22102 TypeSize Offset2 = TypeSize::getFixed(2);
22103 SDValue Ptr2 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset2, DL);
22104 Chain = DAG.getStore(Chain, DL, E2, Ptr2, MF.getMachineMemOperand(MMO, 2, 1));
22105
22106 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22107 DAG.getConstant(1 * IdxScale, DL, MVT::i64));
22108 TypeSize Offset1 = TypeSize::getFixed(1);
22109 SDValue Ptr1 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset1, DL);
22110 Chain = DAG.getStore(Chain, DL, E1, Ptr1, MF.getMachineMemOperand(MMO, 1, 1));
22111
22112 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22113 DAG.getConstant(0, DL, MVT::i64));
22114 Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(),
22115 MF.getMachineMemOperand(MMO, 0, 1));
22116 return Chain;
22117}
22118
22121 SelectionDAG &DAG,
22122 const AArch64Subtarget *Subtarget) {
22123 StoreSDNode *ST = cast<StoreSDNode>(N);
22124 SDValue Chain = ST->getChain();
22125 SDValue Value = ST->getValue();
22126 SDValue Ptr = ST->getBasePtr();
22127 EVT ValueVT = Value.getValueType();
22128
22129 auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
22130 EVT EltVT = VT.getVectorElementType();
22131 return EltVT == MVT::f32 || EltVT == MVT::f64;
22132 };
22133
22134 if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
22135 return Res;
22136
22137 // If this is an FP_ROUND followed by a store, fold this into a truncating
22138 // store. We can do this even if this is already a truncstore.
22139 // We purposefully don't care about legality of the nodes here as we know
22140 // they can be split down into something legal.
22141 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
22142 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
22143 Subtarget->useSVEForFixedLengthVectors() &&
22144 ValueVT.isFixedLengthVector() &&
22145 ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
22146 hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
22147 return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
22148 ST->getMemoryVT(), ST->getMemOperand());
22149
22150 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
22151 return Split;
22152
22153 if (Subtarget->supportsAddressTopByteIgnored() &&
22154 performTBISimplification(N->getOperand(2), DCI, DAG))
22155 return SDValue(N, 0);
22156
22157 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
22158 return Store;
22159
22160 if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
22161 return Store;
22162
22163 if (ST->isTruncatingStore()) {
22164 EVT StoreVT = ST->getMemoryVT();
22165 if (!isHalvingTruncateOfLegalScalableType(ValueVT, StoreVT))
22166 return SDValue();
22167 if (SDValue Rshrnb =
22168 trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) {
22169 return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(),
22170 StoreVT, ST->getMemOperand());
22171 }
22172 }
22173
22174 return SDValue();
22175}
22176
22179 SelectionDAG &DAG,
22180 const AArch64Subtarget *Subtarget) {
22181 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
22182 SDValue Value = MST->getValue();
22183 SDValue Mask = MST->getMask();
22184 SDLoc DL(N);
22185
22186 // If this is a UZP1 followed by a masked store, fold this into a masked
22187 // truncating store. We can do this even if this is already a masked
22188 // truncstore.
22189 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
22190 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
22191 Value.getValueType().isInteger()) {
22192 Value = Value.getOperand(0);
22193 if (Value.getOpcode() == ISD::BITCAST) {
22194 EVT HalfVT =
22195 Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
22196 EVT InVT = Value.getOperand(0).getValueType();
22197
22198 if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
22199 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
22200 unsigned PgPattern = Mask->getConstantOperandVal(0);
22201
22202 // Ensure we can double the size of the predicate pattern
22203 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
22204 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
22205 MinSVESize) {
22206 Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
22207 PgPattern);
22208 return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
22209 MST->getBasePtr(), MST->getOffset(), Mask,
22210 MST->getMemoryVT(), MST->getMemOperand(),
22211 MST->getAddressingMode(),
22212 /*IsTruncating=*/true);
22213 }
22214 }
22215 }
22216 }
22217
22218 if (MST->isTruncatingStore()) {
22219 EVT ValueVT = Value->getValueType(0);
22220 EVT MemVT = MST->getMemoryVT();
22221 if (!isHalvingTruncateOfLegalScalableType(ValueVT, MemVT))
22222 return SDValue();
22223 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Value, DAG, Subtarget)) {
22224 return DAG.getMaskedStore(MST->getChain(), DL, Rshrnb, MST->getBasePtr(),
22225 MST->getOffset(), MST->getMask(),
22226 MST->getMemoryVT(), MST->getMemOperand(),
22227 MST->getAddressingMode(), true);
22228 }
22229 }
22230
22231 return SDValue();
22232}
22233
22234/// \return true if part of the index was folded into the Base.
22235static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
22236 SDLoc DL, SelectionDAG &DAG) {
22237 // This function assumes a vector of i64 indices.
22238 EVT IndexVT = Index.getValueType();
22239 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
22240 return false;
22241
22242 // Simplify:
22243 // BasePtr = Ptr
22244 // Index = X + splat(Offset)
22245 // ->
22246 // BasePtr = Ptr + Offset * scale.
22247 // Index = X
22248 if (Index.getOpcode() == ISD::ADD) {
22249 if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
22250 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
22251 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
22252 Index = Index.getOperand(0);
22253 return true;
22254 }
22255 }
22256
22257 // Simplify:
22258 // BasePtr = Ptr
22259 // Index = (X + splat(Offset)) << splat(Shift)
22260 // ->
22261 // BasePtr = Ptr + (Offset << Shift) * Scale
22262 // Index = X << splat(Shift)
22263 if (Index.getOpcode() == ISD::SHL &&
22264 Index.getOperand(0).getOpcode() == ISD::ADD) {
22265 SDValue Add = Index.getOperand(0);
22266 SDValue ShiftOp = Index.getOperand(1);
22267 SDValue OffsetOp = Add.getOperand(1);
22268 if (auto Shift = DAG.getSplatValue(ShiftOp))
22269 if (auto Offset = DAG.getSplatValue(OffsetOp)) {
22270 Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
22271 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
22272 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
22273 Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
22274 Add.getOperand(0), ShiftOp);
22275 return true;
22276 }
22277 }
22278
22279 return false;
22280}
22281
22282// Analyse the specified address, returning true if a more optimal addressing
22283// mode is available. When returning true, all parameters are updated to reflect
22284// their recommended values.
22286 SDValue &BasePtr, SDValue &Index,
22287 SelectionDAG &DAG) {
22288 // Try to iteratively fold parts of the index into the base pointer to
22289 // simplify the index as much as possible.
22290 bool Changed = false;
22291 while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
22292 Changed = true;
22293
22294 // Only consider element types that are pointer sized as smaller types can
22295 // be easily promoted.
22296 EVT IndexVT = Index.getValueType();
22297 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
22298 return Changed;
22299
22300 // Can indices be trivially shrunk?
22301 EVT DataVT = N->getOperand(1).getValueType();
22302 // Don't attempt to shrink the index for fixed vectors of 64-bit data, since it
22303 // will later be re-extended to 64 bits in legalization.
22304 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
22305 return Changed;
22306 if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
22307 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
22308 Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
22309 return true;
22310 }
22311
22312 // Match:
22313 // Index = step(const)
22314 int64_t Stride = 0;
22315 if (Index.getOpcode() == ISD::STEP_VECTOR) {
22316 Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
22317 }
22318 // Match:
22319 // Index = step(const) << shift(const)
22320 else if (Index.getOpcode() == ISD::SHL &&
22321 Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
22322 SDValue RHS = Index.getOperand(1);
22323 if (auto *Shift =
22324 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
22325 int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
22326 Stride = Step << Shift->getZExtValue();
22327 }
22328 }
22329
22330 // Return early if no supported pattern is found.
22331 if (Stride == 0)
22332 return Changed;
22333
22334 if (Stride < std::numeric_limits<int32_t>::min() ||
22335 Stride > std::numeric_limits<int32_t>::max())
22336 return Changed;
22337
22338 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
22339 unsigned MaxVScale =
22341 int64_t LastElementOffset =
22342 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
22343
22344 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
22345 LastElementOffset > std::numeric_limits<int32_t>::max())
22346 return Changed;
22347
22348 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
22349 // Stride is not scaled explicitly by 'Scale', because that scaling happens in
22350 // the gather/scatter addressing mode.
22351 Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride));
22352 return true;
22353}
22354
22357 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
22358 assert(MGS && "Can only combine gather load or scatter store nodes");
22359
22360 if (!DCI.isBeforeLegalize())
22361 return SDValue();
22362
22363 SDLoc DL(MGS);
22364 SDValue Chain = MGS->getChain();
22365 SDValue Scale = MGS->getScale();
22366 SDValue Index = MGS->getIndex();
22367 SDValue Mask = MGS->getMask();
22368 SDValue BasePtr = MGS->getBasePtr();
22369 ISD::MemIndexType IndexType = MGS->getIndexType();
22370
22371 if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
22372 return SDValue();
22373
22374 // Here we catch such cases early and change MGATHER's IndexType to allow
22375 // the use of an Index that's more legalisation friendly.
22376 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
22377 SDValue PassThru = MGT->getPassThru();
22378 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
22379 return DAG.getMaskedGather(
22380 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
22381 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
22382 }
22383 auto *MSC = cast<MaskedScatterSDNode>(MGS);
22384 SDValue Data = MSC->getValue();
22385 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
22386 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL,
22387 Ops, MSC->getMemOperand(), IndexType,
22388 MSC->isTruncatingStore());
22389}
22390
22391/// Target-specific DAG combine function for NEON load/store intrinsics
22392/// to merge base address updates.
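///
/// For example, ld2 { v0.4s, v1.4s }, [x0] followed by add x0, x0, #32 can be
/// merged into the post-indexed form ld2 { v0.4s, v1.4s }, [x0], #32.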
22395 SelectionDAG &DAG) {
22396 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
22397 return SDValue();
22398
22399 unsigned AddrOpIdx = N->getNumOperands() - 1;
22400 SDValue Addr = N->getOperand(AddrOpIdx);
22401
22402 // Search for a use of the address operand that is an increment.
22403 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
22404 UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
22405 SDNode *User = *UI;
22406 if (User->getOpcode() != ISD::ADD ||
22407 UI.getUse().getResNo() != Addr.getResNo())
22408 continue;
22409
22410 // Check that the add is independent of the load/store. Otherwise, folding
22411 // it would create a cycle.
22414 Visited.insert(Addr.getNode());
22415 Worklist.push_back(N);
22416 Worklist.push_back(User);
22417 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
22418 SDNode::hasPredecessorHelper(User, Visited, Worklist))
22419 continue;
22420
22421 // Find the new opcode for the updating load/store.
22422 bool IsStore = false;
22423 bool IsLaneOp = false;
22424 bool IsDupOp = false;
22425 unsigned NewOpc = 0;
22426 unsigned NumVecs = 0;
22427 unsigned IntNo = N->getConstantOperandVal(1);
22428 switch (IntNo) {
22429 default: llvm_unreachable("unexpected intrinsic for Neon base update");
22430 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
22431 NumVecs = 2; break;
22432 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
22433 NumVecs = 3; break;
22434 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
22435 NumVecs = 4; break;
22436 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
22437 NumVecs = 2; IsStore = true; break;
22438 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
22439 NumVecs = 3; IsStore = true; break;
22440 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
22441 NumVecs = 4; IsStore = true; break;
22442 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
22443 NumVecs = 2; break;
22444 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
22445 NumVecs = 3; break;
22446 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
22447 NumVecs = 4; break;
22448 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
22449 NumVecs = 2; IsStore = true; break;
22450 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
22451 NumVecs = 3; IsStore = true; break;
22452 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
22453 NumVecs = 4; IsStore = true; break;
22454 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
22455 NumVecs = 2; IsDupOp = true; break;
22456 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
22457 NumVecs = 3; IsDupOp = true; break;
22458 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
22459 NumVecs = 4; IsDupOp = true; break;
22460 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
22461 NumVecs = 2; IsLaneOp = true; break;
22462 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
22463 NumVecs = 3; IsLaneOp = true; break;
22464 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
22465 NumVecs = 4; IsLaneOp = true; break;
22466 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
22467 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
22468 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
22469 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
22470 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
22471 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
22472 }
22473
22474 EVT VecTy;
22475 if (IsStore)
22476 VecTy = N->getOperand(2).getValueType();
22477 else
22478 VecTy = N->getValueType(0);
22479
22480 // If the increment is a constant, it must match the memory ref size.
22481 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
22482 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
22483 uint32_t IncVal = CInc->getZExtValue();
22484 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
22485 if (IsLaneOp || IsDupOp)
22486 NumBytes /= VecTy.getVectorNumElements();
22487 if (IncVal != NumBytes)
22488 continue;
22489 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
22490 }
22492 Ops.push_back(N->getOperand(0)); // Incoming chain
22493 // Load-lane and store operations have a vector list as input.
22494 if (IsLaneOp || IsStore)
22495 for (unsigned i = 2; i < AddrOpIdx; ++i)
22496 Ops.push_back(N->getOperand(i));
22497 Ops.push_back(Addr); // Base register
22498 Ops.push_back(Inc);
22499
22500 // Return Types.
22501 EVT Tys[6];
22502 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
22503 unsigned n;
22504 for (n = 0; n < NumResultVecs; ++n)
22505 Tys[n] = VecTy;
22506 Tys[n++] = MVT::i64; // Type of write back register
22507 Tys[n] = MVT::Other; // Type of the chain
22508 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
22509
22510 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
22511 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
22512 MemInt->getMemoryVT(),
22513 MemInt->getMemOperand());
22514
22515 // Update the uses.
22516 std::vector<SDValue> NewResults;
22517 for (unsigned i = 0; i < NumResultVecs; ++i) {
22518 NewResults.push_back(SDValue(UpdN.getNode(), i));
22519 }
22520 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
22521 DCI.CombineTo(N, NewResults);
22522 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
22523
22524 break;
22525 }
22526 return SDValue();
22527}
22528
22529// Checks to see if the value is the prescribed width and returns information
22530// about its extension mode.
22531static
22532bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
22533 ExtType = ISD::NON_EXTLOAD;
22534 switch(V.getNode()->getOpcode()) {
22535 default:
22536 return false;
22537 case ISD::LOAD: {
22538 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
22539 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
22540 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
22541 ExtType = LoadNode->getExtensionType();
22542 return true;
22543 }
22544 return false;
22545 }
22546 case ISD::AssertSext: {
22547 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
22548 if ((TypeNode->getVT() == MVT::i8 && width == 8)
22549 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
22550 ExtType = ISD::SEXTLOAD;
22551 return true;
22552 }
22553 return false;
22554 }
22555 case ISD::AssertZext: {
22556 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
22557 if ((TypeNode->getVT() == MVT::i8 && width == 8)
22558 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
22559 ExtType = ISD::ZEXTLOAD;
22560 return true;
22561 }
22562 return false;
22563 }
22564 case ISD::Constant:
22565 case ISD::TargetConstant: {
22566 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
22567 1LL << (width - 1);
22568 }
22569 }
22570
22571 return true;
22572}
22573
22574// This function does a whole lot of voodoo to determine if the tests are
22575// equivalent without and with a mask. Essentially what happens is that given a
22576// DAG resembling:
22577//
22578// +-------------+ +-------------+ +-------------+ +-------------+
22579// | Input | | AddConstant | | CompConstant| | CC |
22580// +-------------+ +-------------+ +-------------+ +-------------+
22581// | | | |
22582// V V | +----------+
22583// +-------------+ +----+ | |
22584// | ADD | |0xff| | |
22585// +-------------+ +----+ | |
22586// | | | |
22587// V V | |
22588// +-------------+ | |
22589// | AND | | |
22590// +-------------+ | |
22591// | | |
22592// +-----+ | |
22593// | | |
22594// V V V
22595// +-------------+
22596// | CMP |
22597// +-------------+
22598//
22599// The AND node may be safely removed for some combinations of inputs. In
22600// particular we need to take into account the extension type of the Input,
22601// the exact values of AddConstant, CompConstant, and CC, along with the nominal
22602// width of the input (this can work for any width of input; the above graph is
22603// specific to 8 bits).
22604//
22605// The specific equations were worked out by generating output tables for each
22606// AArch64CC value in terms of the AddConstant (w1) and CompConstant (w2). The
22607// problem was simplified by working with 4-bit inputs, which means we only
22608// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
22609// extension (8..15), 8 patterns unique to sign extension (-8..-1), and 8
22610// patterns present in both extensions (0..7). For every distinct set of
22611// AddConstant and CompConstant bit patterns we can consider the masked and
22612// unmasked versions to be equivalent if the result of this function is true for
22613// all 16 distinct bit patterns for the current extension type of Input (w0).
22614//
22615// sub w8, w0, w1
22616// and w10, w8, #0x0f
22617// cmp w8, w2
22618// cset w9, AArch64CC
22619// cmp w10, w2
22620// cset w11, AArch64CC
22621// cmp w9, w11
22622// cset w0, eq
22623// ret
22624//
22625// Since the above function shows when the outputs are equivalent it defines
22626// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
22627// would be expensive to run during compiles. The equations below were written
22628// in a test harness that confirmed they gave outputs equivalent to the above
22629// function for all inputs, so they can be used to determine whether the removal
22630// is legal instead.
22631//
22632// isEquivalentMaskless() is the code for testing if the AND can be removed,
22633// factored out of the DAG recognition, as the DAG can take several forms.
22634
22635static bool isEquivalentMaskless(unsigned CC, unsigned width,
22636 ISD::LoadExtType ExtType, int AddConstant,
22637 int CompConstant) {
22638 // By being careful about our equations and only writing them in terms of
22639 // symbolic values and well-known constants (0, 1, -1, MaxUInt), we can
22640 // make them generally applicable to all bit widths.
22641 int MaxUInt = (1 << width);
22642
22643 // For the purposes of these comparisons sign extending the type is
22644 // equivalent to zero extending the add and displacing it by half the integer
22645 // width. Provided we are careful and make sure our equations are valid over
22646 // the whole range we can just adjust the input and avoid writing equations
22647 // for sign extended inputs.
22648 if (ExtType == ISD::SEXTLOAD)
22649 AddConstant -= (1 << (width-1));
22650
22651 switch(CC) {
22652 case AArch64CC::LE:
22653 case AArch64CC::GT:
22654 if ((AddConstant == 0) ||
22655 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
22656 (AddConstant >= 0 && CompConstant < 0) ||
22657 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
22658 return true;
22659 break;
22660 case AArch64CC::LT:
22661 case AArch64CC::GE:
22662 if ((AddConstant == 0) ||
22663 (AddConstant >= 0 && CompConstant <= 0) ||
22664 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
22665 return true;
22666 break;
22667 case AArch64CC::HI:
22668 case AArch64CC::LS:
22669 if ((AddConstant >= 0 && CompConstant < 0) ||
22670 (AddConstant <= 0 && CompConstant >= -1 &&
22671 CompConstant < AddConstant + MaxUInt))
22672 return true;
22673 break;
22674 case AArch64CC::PL:
22675 case AArch64CC::MI:
22676 if ((AddConstant == 0) ||
22677 (AddConstant > 0 && CompConstant <= 0) ||
22678 (AddConstant < 0 && CompConstant <= AddConstant))
22679 return true;
22680 break;
22681 case AArch64CC::LO:
22682 case AArch64CC::HS:
22683 if ((AddConstant >= 0 && CompConstant <= 0) ||
22684 (AddConstant <= 0 && CompConstant >= 0 &&
22685 CompConstant <= AddConstant + MaxUInt))
22686 return true;
22687 break;
22688 case AArch64CC::EQ:
22689 case AArch64CC::NE:
22690 if ((AddConstant > 0 && CompConstant < 0) ||
22691 (AddConstant < 0 && CompConstant >= 0 &&
22692 CompConstant < AddConstant + MaxUInt) ||
22693 (AddConstant >= 0 && CompConstant >= 0 &&
22694 CompConstant >= AddConstant) ||
22695 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
22696 return true;
22697 break;
22698 case AArch64CC::VS:
22699 case AArch64CC::VC:
22700 case AArch64CC::AL:
22701 case AArch64CC::NV:
22702 return true;
22703 case AArch64CC::Invalid:
22704 break;
22705 }
22706
22707 return false;
22708}
22709
22710// (X & C) >u Mask --> (X & (C & ~Mask)) != 0
22711// (X & C) <u Pow2 --> (X & (C & ~(Pow2-1))) == 0
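//
// For example, (X & 0xf0) >u 0x7 is equivalent to (X & (0xf0 & ~0x7)) != 0,
// i.e. (X & 0xf0) != 0, so the SUBS can be rewritten as an ANDS with the
// adjusted mask and condition.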
22713 SDNode *AndNode, SelectionDAG &DAG,
22714 unsigned CCIndex, unsigned CmpIndex,
22715 unsigned CC) {
22716 ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(SubsNode->getOperand(1));
22717 if (!SubsC)
22718 return SDValue();
22719
22720 APInt SubsAP = SubsC->getAPIntValue();
22721 if (CC == AArch64CC::HI) {
22722 if (!SubsAP.isMask())
22723 return SDValue();
22724 } else if (CC == AArch64CC::LO) {
22725 if (!SubsAP.isPowerOf2())
22726 return SDValue();
22727 } else
22728 return SDValue();
22729
22730 ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1));
22731 if (!AndC)
22732 return SDValue();
22733
22734 APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
22735
22736 SDLoc DL(N);
22737 APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
22738 SDValue ANDS = DAG.getNode(
22739 AArch64ISD::ANDS, DL, SubsNode->getVTList(), AndNode->getOperand(0),
22740 DAG.getConstant(AndSMask, DL, SubsC->getValueType(0)));
22741 SDValue AArch64_CC =
22743 N->getOperand(CCIndex)->getValueType(0));
22744
22745 // For now, only performCSELCombine and performBRCONDCombine call this
22746 // function, and both of them pass 2 for CCIndex and 3 for CmpIndex with 4
22747 // operands. So just initialize the ops directly to simplify the code. If we
22748 // ever have a case with a different CCIndex or CmpIndex, this will need to be
22749 // rewritten with a loop over the operands.
22750 // TODO: Do we need to assert that the number of operands is 4 here?
22751 assert((CCIndex == 2 && CmpIndex == 3) &&
22752 "Expected CCIndex to be 2 and CmpIndex to be 3.");
22753 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), AArch64_CC,
22754 ANDS.getValue(1)};
22755 return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops);
22756}
22757
22758static
22761 SelectionDAG &DAG, unsigned CCIndex,
22762 unsigned CmpIndex) {
22763 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
22764 SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
22765 unsigned CondOpcode = SubsNode->getOpcode();
22766
22767 if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0))
22768 return SDValue();
22769
22770 // There is a SUBS feeding this condition. Is it fed by a mask we can
22771 // use?
22772
22773 SDNode *AndNode = SubsNode->getOperand(0).getNode();
22774 unsigned MaskBits = 0;
22775
22776 if (AndNode->getOpcode() != ISD::AND)
22777 return SDValue();
22778
22779 if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
22780 CmpIndex, CC))
22781 return Val;
22782
22783 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
22784 uint32_t CNV = CN->getZExtValue();
22785 if (CNV == 255)
22786 MaskBits = 8;
22787 else if (CNV == 65535)
22788 MaskBits = 16;
22789 }
22790
22791 if (!MaskBits)
22792 return SDValue();
22793
22794 SDValue AddValue = AndNode->getOperand(0);
22795
22796 if (AddValue.getOpcode() != ISD::ADD)
22797 return SDValue();
22798
22799 // The basic dag structure is correct, grab the inputs and validate them.
22800
22801 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
22802 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
22803 SDValue SubsInputValue = SubsNode->getOperand(1);
22804
22805 // The mask is present and the provenance of all the values is a smaller type,
22806 // so let's see if the mask is superfluous.
22807
22808 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
22809 !isa<ConstantSDNode>(SubsInputValue.getNode()))
22810 return SDValue();
22811
22812 ISD::LoadExtType ExtType;
22813
22814 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
22815 !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
22816 !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
22817 return SDValue();
22818
22819 if (!isEquivalentMaskless(CC, MaskBits, ExtType,
22820 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
22821 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
22822 return SDValue();
22823
22824 // The AND is not necessary, remove it.
22825
22826 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
22827 SubsNode->getValueType(1));
22828 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
22829
22830 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
22831 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
22832
22833 return SDValue(N, 0);
22834}
22835
22836// Optimize compare with zero and branch.
22837static SDValue performBRCONDCombine(SDNode *N,
22838 TargetLowering::DAGCombinerInfo &DCI,
22839 SelectionDAG &DAG) {
22840 MachineFunction &MF = DAG.getMachineFunction();
22841 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
22842 // will not be produced, as they are conditional branch instructions that do
22843 // not set flags.
22844 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
22845 return SDValue();
22846
22847 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
22848 N = NV.getNode();
22849 SDValue Chain = N->getOperand(0);
22850 SDValue Dest = N->getOperand(1);
22851 SDValue CCVal = N->getOperand(2);
22852 SDValue Cmp = N->getOperand(3);
22853
22854 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
22855 unsigned CC = CCVal->getAsZExtVal();
22856 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
22857 return SDValue();
22858
22859 unsigned CmpOpc = Cmp.getOpcode();
22860 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
22861 return SDValue();
22862
22863 // Only attempt folding if there is only one use of the flag and no use of the
22864 // value.
22865 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
22866 return SDValue();
22867
22868 SDValue LHS = Cmp.getOperand(0);
22869 SDValue RHS = Cmp.getOperand(1);
22870
22871 assert(LHS.getValueType() == RHS.getValueType() &&
22872 "Expected the value type to be the same for both operands!");
22873 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
22874 return SDValue();
22875
22876 if (isNullConstant(LHS))
22877 std::swap(LHS, RHS);
22878
22879 if (!isNullConstant(RHS))
22880 return SDValue();
22881
22882 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
22883 LHS.getOpcode() == ISD::SRL)
22884 return SDValue();
22885
22886 // Fold the compare into the branch instruction.
22887 SDValue BR;
22888 if (CC == AArch64CC::EQ)
22889 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
22890 else
22891 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
22892
22893 // Do not add new nodes to DAG combiner worklist.
22894 DCI.CombineTo(N, BR, false);
22895
22896 return SDValue();
22897}
22898
22899static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
22900 unsigned CC = N->getConstantOperandVal(2);
22901 SDValue SUBS = N->getOperand(3);
22902 SDValue Zero, CTTZ;
22903
22904 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
22905 Zero = N->getOperand(0);
22906 CTTZ = N->getOperand(1);
22907 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
22908 Zero = N->getOperand(1);
22909 CTTZ = N->getOperand(0);
22910 } else
22911 return SDValue();
22912
22913 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
22914 (CTTZ.getOpcode() == ISD::TRUNCATE &&
22915 CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
22916 return SDValue();
22917
22918 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
22919 "Illegal type in CTTZ folding");
22920
22921 if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
22922 return SDValue();
22923
22924 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
22925 ? CTTZ.getOperand(0).getOperand(0)
22926 : CTTZ.getOperand(0);
22927
22928 if (X != SUBS.getOperand(0))
22929 return SDValue();
22930
22931 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
22932 ? CTTZ.getOperand(0).getValueSizeInBits()
22933 : CTTZ.getValueSizeInBits();
22934 SDValue BitWidthMinusOne =
22935 DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
22936 return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
22937 BitWidthMinusOne);
22938}
22939
22940// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
22941// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
22942// Where x and y are constants and x != y
22943
22944// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
22945// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
22946// Where x and y are constants and x != y
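// For instance, (CSEL l r EQ (CMP (CSEL 1 0 GT cond) 1)) becomes
// (CSEL l r GT cond): the inner CSEL equals the compared-against constant 1
// exactly when GT held on the original flags.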
22947static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
22948 SDValue L = Op->getOperand(0);
22949 SDValue R = Op->getOperand(1);
22950 AArch64CC::CondCode OpCC =
22951 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
22952
22953 SDValue OpCmp = Op->getOperand(3);
22954 if (!isCMP(OpCmp))
22955 return SDValue();
22956
22957 SDValue CmpLHS = OpCmp.getOperand(0);
22958 SDValue CmpRHS = OpCmp.getOperand(1);
22959
22960 if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
22961 std::swap(CmpLHS, CmpRHS);
22962 else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
22963 return SDValue();
22964
22965 SDValue X = CmpLHS->getOperand(0);
22966 SDValue Y = CmpLHS->getOperand(1);
22967 if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y) || X == Y) {
22968 return SDValue();
22969 }
22970
22971 // If one of the constants is an opaque constant, the X and Y SDNodes can
22972 // still be different even though the underlying values are the same. So
22973 // compare the APInts here to make sure the code is correct.
22974 ConstantSDNode *CX = cast<ConstantSDNode>(X);
22975 ConstantSDNode *CY = cast<ConstantSDNode>(Y);
22976 if (CX->getAPIntValue() == CY->getAPIntValue())
22977 return SDValue();
22978
22979 AArch64CC::CondCode CC =
22980 static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
22981 SDValue Cond = CmpLHS->getOperand(3);
22982
22983 if (CmpRHS == Y)
22984 CC = AArch64CC::getInvertedCondCode(CC);
22985 else if (CmpRHS != X)
22986 return SDValue();
22987
22988 if (OpCC == AArch64CC::NE)
22989 CC = AArch64CC::getInvertedCondCode(CC);
22990 else if (OpCC != AArch64CC::EQ)
22991 return SDValue();
22992
22993 SDLoc DL(Op);
22994 EVT VT = Op->getValueType(0);
22995
22996 SDValue CCValue = DAG.getConstant(CC, DL, MVT::i32);
22997 return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
22998}
22999
23000// Optimize CSEL instructions
23001static SDValue performCSELCombine(SDNode *N,
23002 TargetLowering::DAGCombinerInfo &DCI,
23003 SelectionDAG &DAG) {
23004 // CSEL x, x, cc -> x
23005 if (N->getOperand(0) == N->getOperand(1))
23006 return N->getOperand(0);
23007
23008 if (SDValue R = foldCSELOfCSEL(N, DAG))
23009 return R;
23010
23011 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
23012 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
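// The AND works because CTTZ returns the bitwidth for a zero input (e.g. 32
// for i32), and bitwidth & (bitwidth - 1) == 0, which is exactly the value
// the CSEL would have produced for X == 0.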
23013 if (SDValue Folded = foldCSELofCTTZ(N, DAG))
23014 return Folded;
23015
23016 return performCONDCombine(N, DCI, DAG, 2, 3);
23017}
23018
23019// Try to re-use an already extended operand of a vector SetCC feeding an
23020// extended select. Doing so avoids requiring another full extension of the
23021// SET_CC result when lowering the select.
23022static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
23023 EVT Op0MVT = Op->getOperand(0).getValueType();
23024 if (!Op0MVT.isVector() || Op->use_empty())
23025 return SDValue();
23026
23027 // Make sure that all uses of Op are VSELECTs with result matching types where
23028 // the result type has a larger element type than the SetCC operand.
23029 SDNode *FirstUse = *Op->use_begin();
23030 if (FirstUse->getOpcode() != ISD::VSELECT)
23031 return SDValue();
23032 EVT UseMVT = FirstUse->getValueType(0);
23033 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
23034 return SDValue();
23035 if (any_of(Op->uses(), [&UseMVT](const SDNode *N) {
23036 return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
23037 }))
23038 return SDValue();
23039
23040 APInt V;
23041 if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
23042 return SDValue();
23043
23044 SDLoc DL(Op);
23045 SDValue Op0ExtV;
23046 SDValue Op1ExtV;
23047 ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
23048 // Check if the first operand of the SET_CC is already extended. If it is,
23049 // split the SET_CC and re-use the extended version of the operand.
23050 SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
23051 Op->getOperand(0));
23052 SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
23053 Op->getOperand(0));
23054 if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
23055 Op0ExtV = SDValue(Op0SExt, 0);
23056 Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
23057 } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
23058 Op0ExtV = SDValue(Op0ZExt, 0);
23059 Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
23060 } else
23061 return SDValue();
23062
23063 return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
23064 Op0ExtV, Op1ExtV, Op->getOperand(2));
23065}
23066
23067static SDValue
23068performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
23069 SelectionDAG &DAG) {
23070 SDValue Vec = N->getOperand(0);
23071 if (DCI.isBeforeLegalize() &&
23072 Vec.getValueType().getVectorElementType() == MVT::i1 &&
23075 SDLoc DL(N);
23076 return getVectorBitwiseReduce(N->getOpcode(), Vec, N->getValueType(0), DL,
23077 DAG);
23078 }
23079
23080 return SDValue();
23081}
23082
23083static SDValue performSETCCCombine(SDNode *N,
23084 TargetLowering::DAGCombinerInfo &DCI,
23085 SelectionDAG &DAG) {
23086 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
23087 SDValue LHS = N->getOperand(0);
23088 SDValue RHS = N->getOperand(1);
23089 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
23090 SDLoc DL(N);
23091 EVT VT = N->getValueType(0);
23092
23093 if (SDValue V = tryToWidenSetCCOperands(N, DAG))
23094 return V;
23095
23096 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
23097 if (Cond == ISD::SETNE && isOneConstant(RHS) &&
23098 LHS->getOpcode() == AArch64ISD::CSEL &&
23099 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
23100 LHS->hasOneUse()) {
23101 // Invert CSEL's condition.
23102 auto OldCond =
23103 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
23104 auto NewCond = getInvertedCondCode(OldCond);
23105
23106 // csel 0, 1, !cond, X
23107 SDValue CSEL =
23108 DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
23109 LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
23110 LHS.getOperand(3));
23111 return DAG.getZExtOrTrunc(CSEL, DL, VT);
23112 }
23113
23114 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
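// For example, with imm = 4 on an i32 value, (x srl 4) != 0 is equivalent to
// (x & 0xfffffff0) != 0, and the masked form can be selected as a single TST
// with an immediate (see emitComparison).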
23115 if (Cond == ISD::SETNE && isNullConstant(RHS) &&
23116 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
23117 LHS->getConstantOperandVal(1) < VT.getScalarSizeInBits() &&
23118 LHS->hasOneUse()) {
23119 EVT TstVT = LHS->getValueType(0);
23120 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) {
23121 // this pattern will get better opt in emitComparison
23122 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
23123 SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
23124 DAG.getConstant(TstImm, DL, TstVT));
23125 return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
23126 }
23127 }
23128
23129 // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
23130 // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
23131 // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne)
23132 // ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne)
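// For example, an i8 (bitcast (v8i1 X)) compares equal to 0 exactly when no
// lane of X is set, which is what the vecreduce_or form tests; the all-ones
// case mirrors this with vecreduce_and.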
23133 if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
23134 (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
23136 LHS->getOpcode() == ISD::BITCAST) {
23137 EVT ToVT = LHS->getValueType(0);
23138 EVT FromVT = LHS->getOperand(0).getValueType();
23139 if (FromVT.isFixedLengthVector() &&
23140 FromVT.getVectorElementType() == MVT::i1) {
23141 bool IsNull = isNullConstant(RHS);
23142 LHS = DAG.getNode(IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND,
23143 DL, MVT::i1, LHS->getOperand(0));
23144 LHS = DAG.getNode(IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, ToVT,
23145 LHS);
23146 return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
23147 }
23148 }
23149
23150 // Try to perform the memcmp when the result is tested for [in]equality with 0
23151 if (SDValue V = performOrXorChainCombine(N, DAG))
23152 return V;
23153
23154 return SDValue();
23155}
23156
23157// Replace a flag-setting operator (eg ANDS) with the generic version
23158// (eg AND) if the flag is unused.
23159static SDValue performFlagSettingCombine(SDNode *N,
23160 TargetLowering::DAGCombinerInfo &DCI,
23161 unsigned GenericOpcode) {
23162 SDLoc DL(N);
23163 SDValue LHS = N->getOperand(0);
23164 SDValue RHS = N->getOperand(1);
23165 EVT VT = N->getValueType(0);
23166
23167 // If the flag result isn't used, convert back to a generic opcode.
23168 if (!N->hasAnyUseOfValue(1)) {
23169 SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
23170 return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)},
23171 DL);
23172 }
23173
23174 // Combine identical generic nodes into this node, re-using the result.
23175 if (SDNode *Generic = DCI.DAG.getNodeIfExists(
23176 GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
23177 DCI.CombineTo(Generic, SDValue(N, 0));
23178
23179 return SDValue();
23180}
23181
23182static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
23183 // setcc_merge_zero pred
23184 // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
23185 // => extract_subvector (inner setcc_merge_zero)
23186 SDValue Pred = N->getOperand(0);
23187 SDValue LHS = N->getOperand(1);
23188 SDValue RHS = N->getOperand(2);
23189 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
23190
23191 if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
23192 LHS->getOpcode() != ISD::SIGN_EXTEND)
23193 return SDValue();
23194
23195 SDValue Extract = LHS->getOperand(0);
23196 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
23197 Extract->getValueType(0) != N->getValueType(0) ||
23198 Extract->getConstantOperandVal(1) != 0)
23199 return SDValue();
23200
23201 SDValue InnerSetCC = Extract->getOperand(0);
23202 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
23203 return SDValue();
23204
23205 // By this point we've effectively got
23206 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
23207 // lanes are already zero then the trunc(sext()) sequence is redundant and we
23208 // can operate on A directly.
23209 SDValue InnerPred = InnerSetCC.getOperand(0);
23210 if (Pred.getOpcode() == AArch64ISD::PTRUE &&
23211 InnerPred.getOpcode() == AArch64ISD::PTRUE &&
23212 Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
23213 Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
23214 Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
23215 return Extract;
23216
23217 return SDValue();
23218}
23219
23220static SDValue
23221performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
23222 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
23223 "Unexpected opcode!");
23224
23225 SelectionDAG &DAG = DCI.DAG;
23226 SDValue Pred = N->getOperand(0);
23227 SDValue LHS = N->getOperand(1);
23228 SDValue RHS = N->getOperand(2);
23229 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
23230
23231 if (SDValue V = performSetCCPunpkCombine(N, DAG))
23232 return V;
23233
23234 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
23235 LHS->getOpcode() == ISD::SIGN_EXTEND &&
23236 LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
23237 // setcc_merge_zero(
23238 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
23239 // => setcc_merge_zero(pred, ...)
23240 if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
23241 LHS->getOperand(0)->getOperand(0) == Pred)
23242 return LHS->getOperand(0);
23243
23244 // setcc_merge_zero(
23245 // all_active, extend(nxvNi1 ...), != splat(0))
23246 // -> nxvNi1 ...
23247 if (isAllActivePredicate(DAG, Pred))
23248 return LHS->getOperand(0);
23249
23250 // setcc_merge_zero(
23251 // pred, extend(nxvNi1 ...), != splat(0))
23252 // -> nxvNi1 and(pred, ...)
23253 if (DCI.isAfterLegalizeDAG())
23254 // Do this after legalization to allow more folds on setcc_merge_zero
23255 // to be recognized.
23256 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
23257 LHS->getOperand(0), Pred);
23258 }
23259
23260 return SDValue();
23261}
23262
23263// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
23264// as well as whether the test should be inverted. This code is required to
23265// catch these cases (as opposed to standard dag combines) because
23266// AArch64ISD::TBZ is matched during legalization.
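// For example, (tbz (srl x, 2), 3) tests bit 3 of x >> 2, i.e. bit 5 of x, so
// it can be rewritten as (tbz x, 5) and the shift removed.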
23267static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
23268 SelectionDAG &DAG) {
23269
23270 if (!Op->hasOneUse())
23271 return Op;
23272
23273 // We don't handle undef/constant-fold cases below, as they should have
23274 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
23275 // etc.)
23276
23277 // (tbz (trunc x), b) -> (tbz x, b)
23278 // This case is just here to enable more of the below cases to be caught.
23279 if (Op->getOpcode() == ISD::TRUNCATE &&
23280 Bit < Op->getValueType(0).getSizeInBits()) {
23281 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23282 }
23283
23284 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
23285 if (Op->getOpcode() == ISD::ANY_EXTEND &&
23286 Bit < Op->getOperand(0).getValueSizeInBits()) {
23287 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23288 }
23289
23290 if (Op->getNumOperands() != 2)
23291 return Op;
23292
23293 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
23294 if (!C)
23295 return Op;
23296
23297 switch (Op->getOpcode()) {
23298 default:
23299 return Op;
23300
23301 // (tbz (and x, m), b) -> (tbz x, b)
23302 case ISD::AND:
23303 if ((C->getZExtValue() >> Bit) & 1)
23304 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23305 return Op;
23306
23307 // (tbz (shl x, c), b) -> (tbz x, b-c)
23308 case ISD::SHL:
23309 if (C->getZExtValue() <= Bit &&
23310 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
23311 Bit = Bit - C->getZExtValue();
23312 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23313 }
23314 return Op;
23315
23316 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
23317 case ISD::SRA:
23318 Bit = Bit + C->getZExtValue();
23319 if (Bit >= Op->getValueType(0).getSizeInBits())
23320 Bit = Op->getValueType(0).getSizeInBits() - 1;
23321 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23322
23323 // (tbz (srl x, c), b) -> (tbz x, b+c)
23324 case ISD::SRL:
23325 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
23326 Bit = Bit + C->getZExtValue();
23327 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23328 }
23329 return Op;
23330
23331 // (tbz (xor x, -1), b) -> (tbnz x, b)
23332 case ISD::XOR:
23333 if ((C->getZExtValue() >> Bit) & 1)
23334 Invert = !Invert;
23335 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23336 }
23337}
23338
23339// Optimize test single bit zero/non-zero and branch.
23340static SDValue performTBZCombine(SDNode *N,
23341 TargetLowering::DAGCombinerInfo &DCI,
23342 SelectionDAG &DAG) {
23343 unsigned Bit = N->getConstantOperandVal(2);
23344 bool Invert = false;
23345 SDValue TestSrc = N->getOperand(1);
23346 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
23347
23348 if (TestSrc == NewTestSrc)
23349 return SDValue();
23350
23351 unsigned NewOpc = N->getOpcode();
23352 if (Invert) {
23353 if (NewOpc == AArch64ISD::TBZ)
23354 NewOpc = AArch64ISD::TBNZ;
23355 else {
23356 assert(NewOpc == AArch64ISD::TBNZ);
23357 NewOpc = AArch64ISD::TBZ;
23358 }
23359 }
23360
23361 SDLoc DL(N);
23362 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
23363 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
23364}
23365
23366// Swap vselect operands where it may allow a predicated operation to achieve
23367// the `sel`.
23368//
23369// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
23370// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
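// The point of the swap is that with (op a b) in the lane-active arm, the
// whole pattern can typically be selected as a merging predicated
// FADD/FSUB/FMUL whose inactive lanes already hold a, with no separate select.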
23371static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
23372 auto SelectA = N->getOperand(1);
23373 auto SelectB = N->getOperand(2);
23374 auto NTy = N->getValueType(0);
23375
23376 if (!NTy.isScalableVector())
23377 return SDValue();
23378 SDValue SetCC = N->getOperand(0);
23379 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
23380 return SDValue();
23381
23382 switch (SelectB.getOpcode()) {
23383 default:
23384 return SDValue();
23385 case ISD::FMUL:
23386 case ISD::FSUB:
23387 case ISD::FADD:
23388 break;
23389 }
23390 if (SelectA != SelectB.getOperand(0))
23391 return SDValue();
23392
23393 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
23394 ISD::CondCode InverseCC =
23395 ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
23396 auto InverseSetCC =
23397 DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
23398 SetCC.getOperand(1), InverseCC);
23399
23400 return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
23401 {InverseSetCC, SelectB, SelectA});
23402}
23403
23404// vselect (v1i1 setcc) ->
23405// vselect (v1iXX setcc) (XX is the size of the compared operand type)
23406// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
23407// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
23408// such VSELECT.
23409static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
23410 if (auto SwapResult = trySwapVSelectOperands(N, DAG))
23411 return SwapResult;
23412
23413 SDValue N0 = N->getOperand(0);
23414 EVT CCVT = N0.getValueType();
23415
23416 if (isAllActivePredicate(DAG, N0))
23417 return N->getOperand(1);
23418
23419 if (isAllInactivePredicate(N0))
23420 return N->getOperand(2);
23421
23422 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
23423 // into (OR (ASR lhs, N-1), 1), which requires less instructions for the
23424 // supported types.
23425 SDValue SetCC = N->getOperand(0);
23426 if (SetCC.getOpcode() == ISD::SETCC &&
23427 SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
23428 SDValue CmpLHS = SetCC.getOperand(0);
23429 EVT VT = CmpLHS.getValueType();
23430 SDNode *CmpRHS = SetCC.getOperand(1).getNode();
23431 SDNode *SplatLHS = N->getOperand(1).getNode();
23432 SDNode *SplatRHS = N->getOperand(2).getNode();
23433 APInt SplatLHSVal;
23434 if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
23435 VT.isSimple() &&
23436 is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
23437 MVT::v2i32, MVT::v4i32, MVT::v2i64}),
23438 VT.getSimpleVT().SimpleTy) &&
23439 ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
23440 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
23441 ISD::isConstantSplatVectorAllOnes(SplatRHS)) {
23442 unsigned NumElts = VT.getVectorNumElements();
23443 SmallVector<SDValue> Ops(
23444 NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
23445 VT.getScalarType()));
23446 SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
23447
23448 auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
23449 auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
23450 return Or;
23451 }
23452 }
23453
23454 EVT CmpVT = N0.getOperand(0).getValueType();
23455 if (N0.getOpcode() != ISD::SETCC ||
23457 CCVT.getVectorElementType() != MVT::i1 ||
23459 return SDValue();
23460
23461 EVT ResVT = N->getValueType(0);
23462 // Only combine when the result type is of the same size as the compared
23463 // operands.
23464 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
23465 return SDValue();
23466
23467 SDValue IfTrue = N->getOperand(1);
23468 SDValue IfFalse = N->getOperand(2);
23469 SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
23470 N0.getOperand(0), N0.getOperand(1),
23471 cast<CondCodeSDNode>(N0.getOperand(2))->get());
23472 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
23473 IfTrue, IfFalse);
23474}
23475
23476/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
23477/// the compare-mask instructions rather than going via NZCV, even if LHS and
23478/// RHS are really scalar. This replaces any scalar setcc in the above pattern
23479/// with a vector one followed by a DUP shuffle on the result.
23480static SDValue performSelectCombine(SDNode *N,
23481 TargetLowering::DAGCombinerInfo &DCI) {
23482 SelectionDAG &DAG = DCI.DAG;
23483 SDValue N0 = N->getOperand(0);
23484 EVT ResVT = N->getValueType(0);
23485
23486 if (N0.getOpcode() != ISD::SETCC)
23487 return SDValue();
23488
23489 if (ResVT.isScalableVT())
23490 return SDValue();
23491
23492 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
23493 // scalar SetCCResultType. We also don't expect vectors, because we assume
23494 // that selects fed by vector SETCCs are canonicalized to VSELECT.
23495 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
23496 "Scalar-SETCC feeding SELECT has unexpected result type!");
23497
23498 // If NumMaskElts == 0, the comparison is larger than select result. The
23499 // largest real NEON comparison is 64-bits per lane, which means the result is
23500 // at most 32-bits and an illegal vector. Just bail out for now.
23501 EVT SrcVT = N0.getOperand(0).getValueType();
23502
23503 // Don't try to do this optimization when the setcc itself has i1 operands.
23504 // There are no legal vectors of i1, so this would be pointless. v1f16 is
23505 // ruled out to prevent the creation of setcc that need to be scalarized.
23506 if (SrcVT == MVT::i1 ||
23507 (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
23508 return SDValue();
23509
23510 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
23511 if (!ResVT.isVector() || NumMaskElts == 0)
23512 return SDValue();
23513
23514 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
23515 EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
23516
23517 // Also bail out if the vector CCVT isn't the same size as ResVT.
23518 // This can happen if the SETCC operand size doesn't divide the ResVT size
23519 // (e.g., f64 vs v3f32).
23520 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
23521 return SDValue();
23522
23523 // Make sure we didn't create illegal types, if we're not supposed to.
23524 assert(DCI.isBeforeLegalize() ||
23525 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
23526
23527 // First perform a vector comparison, where lane 0 is the one we're interested
23528 // in.
23529 SDLoc DL(N0);
23530 SDValue LHS =
23531 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
23532 SDValue RHS =
23533 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
23534 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
23535
23536 // Now duplicate the comparison mask we want across all other lanes.
23537 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
23538 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
23539 Mask = DAG.getNode(ISD::BITCAST, DL,
23540 ResVT.changeVectorElementTypeToInteger(), Mask);
23541
23542 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
23543}
23544
23545static SDValue performDUPCombine(SDNode *N,
23546 TargetLowering::DAGCombinerInfo &DCI) {
23547 EVT VT = N->getValueType(0);
23548 SDLoc DL(N);
23549 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
23550 // 128bit vector version.
23551 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
23552 EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
23553 SmallVector<SDValue> Ops(N->ops());
23554 if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
23555 DCI.DAG.getVTList(LVT), Ops)) {
23556 return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
23557 DCI.DAG.getConstant(0, DL, MVT::i64));
23558 }
23559 }
23560
23561 if (N->getOpcode() == AArch64ISD::DUP) {
23562 if (DCI.isAfterLegalizeDAG()) {
23563 // If scalar dup's operand is extract_vector_elt, try to combine them into
23564 // duplane. For example,
23565 //
23566 // t21: i32 = extract_vector_elt t19, Constant:i64<0>
23567 // t18: v4i32 = AArch64ISD::DUP t21
23568 // ==>
23569 // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
23570 SDValue EXTRACT_VEC_ELT = N->getOperand(0);
23571 if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23572 if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) {
23573 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
23574 return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0),
23575 EXTRACT_VEC_ELT.getOperand(1));
23576 }
23577 }
23578 }
23579
23580 return performPostLD1Combine(N, DCI, false);
23581 }
23582
23583 return SDValue();
23584}
23585
23586/// Get rid of unnecessary NVCASTs (that don't change the type).
23587static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG) {
23588 if (N->getValueType(0) == N->getOperand(0).getValueType())
23589 return N->getOperand(0);
23590 if (N->getOperand(0).getOpcode() == AArch64ISD::NVCAST)
23591 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(N), N->getValueType(0),
23592 N->getOperand(0).getOperand(0));
23593
23594 return SDValue();
23595}
23596
23597// If all users of the globaladdr are of the form (globaladdr + constant), find
23598// the smallest constant, fold it into the globaladdr's offset and rewrite the
23599// globaladdr as (globaladdr + constant) - constant.
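// For example, if a globaladdr G (offset 0) is only used as (add G, 16) and
// (add G, 40), then MinOffset = 16 is folded in and G is rewritten as
// (globaladdr G + 16) - 16, after which the adds can fold to the new
// globaladdr plus 0 and plus 24.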
23600static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
23601 const AArch64Subtarget *Subtarget,
23602 const TargetMachine &TM) {
23603 auto *GN = cast<GlobalAddressSDNode>(N);
23604 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
23605 AArch64II::MO_NO_FLAG)
23606 return SDValue();
23607
23608 uint64_t MinOffset = -1ull;
23609 for (SDNode *N : GN->uses()) {
23610 if (N->getOpcode() != ISD::ADD)
23611 return SDValue();
23612 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
23613 if (!C)
23614 C = dyn_cast<ConstantSDNode>(N->getOperand(1));
23615 if (!C)
23616 return SDValue();
23617 MinOffset = std::min(MinOffset, C->getZExtValue());
23618 }
23619 uint64_t Offset = MinOffset + GN->getOffset();
23620
23621 // Require that the new offset is larger than the existing one. Otherwise, we
23622 // can end up oscillating between two possible DAGs, for example,
23623 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
23624 if (Offset <= uint64_t(GN->getOffset()))
23625 return SDValue();
23626
23627 // Check whether folding this offset is legal. It must not go out of bounds of
23628 // the referenced object to avoid violating the code model, and must be
23629 // smaller than 2^20 because this is the largest offset expressible in all
23630 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
23631 // stores an immediate signed 21 bit offset.)
23632 //
23633 // This check also prevents us from folding negative offsets, which will end
23634 // up being treated in the same way as large positive ones. They could also
23635 // cause code model violations, and aren't really common enough to matter.
23636 if (Offset >= (1 << 20))
23637 return SDValue();
23638
23639 const GlobalValue *GV = GN->getGlobal();
23640 Type *T = GV->getValueType();
23641 if (!T->isSized() ||
23643 return SDValue();
23644
23645 SDLoc DL(GN);
23646 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
23647 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
23648 DAG.getConstant(MinOffset, DL, MVT::i64));
23649}
23650
23651static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG,
23652 const AArch64Subtarget *Subtarget) {
23653 SDValue BR = N->getOperand(0);
23654 if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
23655 !BR.getValueType().isScalarInteger())
23656 return SDValue();
23657
23658 SDLoc DL(N);
23659 return DAG.getNode(ISD::CTTZ, DL, BR.getValueType(), BR.getOperand(0));
23660}
23661
23662// Turns the vector of indices into a vector of byte offsets by scaling Offset
23663// by (BitWidth / 8).
23664static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
23665 SDLoc DL, unsigned BitWidth) {
23666 assert(Offset.getValueType().isScalableVector() &&
23667 "This method is only for scalable vectors of offsets");
23668
23669 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
23670 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
23671
23672 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
23673}
23674
23675/// Check if the value of \p OffsetInBytes can be used as an immediate for
23676/// the gather load/prefetch and scatter store instructions with vector base and
23677/// immediate offset addressing mode:
23678///
23679/// [<Zn>.[S|D]{, #<imm>}]
23680///
23681/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
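/// For example, for 32-bit elements (sizeof(<T>) == 4) the valid immediates
/// are 0, 4, 8, ..., 124.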
23682inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
23683 unsigned ScalarSizeInBytes) {
23684 // The immediate is not a multiple of the scalar size.
23685 if (OffsetInBytes % ScalarSizeInBytes)
23686 return false;
23687
23688 // The immediate is out of range.
23689 if (OffsetInBytes / ScalarSizeInBytes > 31)
23690 return false;
23691
23692 return true;
23693}
23694
23695/// Check if the value of \p Offset represents a valid immediate for the SVE
23696/// gather load/prefetch and scatter store instructions with vector base and
23697/// immediate offset addressing mode:
23698///
23699/// [<Zn>.[S|D]{, #<imm>}]
23700///
23701/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
23702static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
23703 unsigned ScalarSizeInBytes) {
23704 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
23705 return OffsetConst && isValidImmForSVEVecImmAddrMode(
23706 OffsetConst->getZExtValue(), ScalarSizeInBytes);
23707}
23708
23709static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
23710 unsigned Opcode,
23711 bool OnlyPackedOffsets = true) {
23712 const SDValue Src = N->getOperand(2);
23713 const EVT SrcVT = Src->getValueType(0);
23714 assert(SrcVT.isScalableVector() &&
23715 "Scatter stores are only possible for SVE vectors");
23716
23717 SDLoc DL(N);
23718 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
23719
23720 // Make sure that source data will fit into an SVE register
23722 return SDValue();
23723
23724 // For FPs, ACLE only supports _packed_ single and double precision types.
23725 // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
23726 if (SrcElVT.isFloatingPoint())
23727 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
23728 ((Opcode != AArch64ISD::SST1Q_PRED &&
23729 Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
23730 ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
23731 return SDValue();
23732
23733 // Depending on the addressing mode, this is either a pointer or a vector of
23734 // pointers (that fits into one register)
23735 SDValue Base = N->getOperand(4);
23736 // Depending on the addressing mode, this is either a single offset or a
23737 // vector of offsets (that fits into one register)
23738 SDValue Offset = N->getOperand(5);
23739
23740 // For "scalar + vector of indices", just scale the indices. This only
23741 // applies to non-temporal scatters because there's no instruction that takes
23742 // indices.
23743 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
23744 Offset =
23746 Opcode = AArch64ISD::SSTNT1_PRED;
23747 } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
23748 Offset =
23750 Opcode = AArch64ISD::SST1Q_PRED;
23751 }
23752
23753 // In the case of non-temporal gather loads there's only one SVE instruction
23754 // per data-size: "scalar + vector", i.e.
23755 // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
23756 // Since we do have intrinsics that allow the arguments to be in a different
23757 // order, we may need to swap them to match the spec.
23758 if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
23759 Offset.getValueType().isVector())
23761
23762 // SST1_IMM requires that the offset is an immediate that is:
23763 // * a multiple of #SizeInBytes,
23764 // * in the range [0, 31 x #SizeInBytes],
23765 // where #SizeInBytes is the size in bytes of the stored items. For
23766 // immediates outside that range and non-immediate scalar offsets use SST1 or
23767 // SST1_UXTW instead.
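// For example, for 64-bit elements the immediate must be a multiple of 8 in
// the range [0, 248]; an offset of 256, or a scalar register offset, has to
// take the SST1/SST1_UXTW path instead.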
23768 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
23770 SrcVT.getScalarSizeInBits() / 8)) {
23771 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
23773 else
23774 Opcode = AArch64ISD::SST1_PRED;
23775
23777 }
23778 }
23779
23780 auto &TLI = DAG.getTargetLoweringInfo();
23781 if (!TLI.isTypeLegal(Base.getValueType()))
23782 return SDValue();
23783
23784 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
23785 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
23786 // nxv2i64. Legalize accordingly.
23787 if (!OnlyPackedOffsets &&
23788 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
23789 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
23790
23791 if (!TLI.isTypeLegal(Offset.getValueType()))
23792 return SDValue();
23793
23794 // Source value type that is representable in hardware
23795 EVT HwSrcVt = getSVEContainerType(SrcVT);
23796
23797 // Keep the original type of the input data to store - this is needed to be
23798 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
23799 // FP values we want the integer equivalent, so just use HwSrcVt.
23800 SDValue InputVT = DAG.getValueType(SrcVT);
23801 if (SrcVT.isFloatingPoint())
23802 InputVT = DAG.getValueType(HwSrcVt);
23803
23804 SDVTList VTs = DAG.getVTList(MVT::Other);
23805 SDValue SrcNew;
23806
23807 if (Src.getValueType().isFloatingPoint())
23808 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
23809 else
23810 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
23811
23812 SDValue Ops[] = {N->getOperand(0), // Chain
23813 SrcNew,
23814 N->getOperand(3), // Pg
23815 Base,
23816 Offset,
23817 InputVT};
23818
23819 return DAG.getNode(Opcode, DL, VTs, Ops);
23820}
23821
23822static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
23823 unsigned Opcode,
23824 bool OnlyPackedOffsets = true) {
23825 const EVT RetVT = N->getValueType(0);
23826 assert(RetVT.isScalableVector() &&
23827 "Gather loads are only possible for SVE vectors");
23828
23829 SDLoc DL(N);
23830
23831 // Make sure that the loaded data will fit into an SVE register
23833 return SDValue();
23834
23835 // Depending on the addressing mode, this is either a pointer or a vector of
23836 // pointers (that fits into one register)
23837 SDValue Base = N->getOperand(3);
23838 // Depending on the addressing mode, this is either a single offset or a
23839 // vector of offsets (that fits into one register)
23840 SDValue Offset = N->getOperand(4);
23841
23842 // For "scalar + vector of indices", scale the indices to obtain unscaled
23843 // offsets. This applies to non-temporal and quadword gathers, which do not
23844 // have an addressing mode with scaled offset.
23847 RetVT.getScalarSizeInBits());
23849 } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
23851 RetVT.getScalarSizeInBits());
23853 }
23854
23855 // In the case of non-temporal gather loads and quadword gather loads there's
23856 // only one addressing mode : "vector + scalar", e.g.
23857 // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
23858 // Since we do have intrinsics that allow the arguments to be in a different
23859 // order, we may need to swap them to match the spec.
23860 if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
23861 Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
23862 Offset.getValueType().isVector())
23864
23865 // GLD{FF}1_IMM requires that the offset is an immediate that is:
23866 // * a multiple of #SizeInBytes,
23867 // * in the range [0, 31 x #SizeInBytes],
23868 // where #SizeInBytes is the size in bytes of the loaded items. For
23869 // immediates outside that range and non-immediate scalar offsets use
23870 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
23871 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
23874 RetVT.getScalarSizeInBits() / 8)) {
23875 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
23876 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
23879 else
23880 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
23883
23885 }
23886 }
23887
23888 auto &TLI = DAG.getTargetLoweringInfo();
23889 if (!TLI.isTypeLegal(Base.getValueType()))
23890 return SDValue();
23891
23892 // Some gather load variants allow unpacked offsets, but only as nxv2i32
23893 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
23894 // nxv2i64. Legalize accordingly.
23895 if (!OnlyPackedOffsets &&
23896 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
23897 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
23898
23899 // Return value type that is representable in hardware
23900 EVT HwRetVt = getSVEContainerType(RetVT);
23901
23902 // Keep the original output value type around - this is needed to be able to
23903 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
23904 // values we want the integer equivalent, so just use HwRetVT.
23905 SDValue OutVT = DAG.getValueType(RetVT);
23906 if (RetVT.isFloatingPoint())
23907 OutVT = DAG.getValueType(HwRetVt);
23908
23909 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
23910 SDValue Ops[] = {N->getOperand(0), // Chain
23911 N->getOperand(2), // Pg
23912 Base, Offset, OutVT};
23913
23914 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
23915 SDValue LoadChain = SDValue(Load.getNode(), 1);
23916
23917 if (RetVT.isInteger() && (RetVT != HwRetVt))
23918 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
23919
23920 // If the original return value was FP, bitcast accordingly. Doing it here
23921 // means that we can avoid adding TableGen patterns for FPs.
23922 if (RetVT.isFloatingPoint())
23923 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
23924
23925 return DAG.getMergeValues({Load, LoadChain}, DL);
23926}
23927
23928static SDValue
23929performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
23930 SelectionDAG &DAG) {
23931 SDLoc DL(N);
23932 SDValue Src = N->getOperand(0);
23933 unsigned Opc = Src->getOpcode();
23934
23935 // Sign extend of an unsigned unpack -> signed unpack
23936 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
23937
23938 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
23939 : AArch64ISD::SUNPKLO;
23940
23941 // Push the sign extend to the operand of the unpack
23942 // This is necessary where, for example, the operand of the unpack
23943 // is another unpack:
23944 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
23945 // ->
23946 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
23947 // ->
23948 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
23949 SDValue ExtOp = Src->getOperand(0);
23950 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
23951 EVT EltTy = VT.getVectorElementType();
23952 (void)EltTy;
23953
23954 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
23955 "Sign extending from an invalid type");
23956
23957 EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
23958
23959 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
23960 ExtOp, DAG.getValueType(ExtVT));
23961
23962 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
23963 }
23964
23965 if (DCI.isBeforeLegalizeOps())
23966 return SDValue();
23967
23968 if (!EnableCombineMGatherIntrinsics)
23969 return SDValue();
23970
23971 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
23972 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
23973 unsigned NewOpc;
23974 unsigned MemVTOpNum = 4;
23975 switch (Opc) {
23978 MemVTOpNum = 3;
23979 break;
23982 MemVTOpNum = 3;
23983 break;
23986 MemVTOpNum = 3;
23987 break;
23990 break;
23993 break;
23996 break;
23999 break;
24002 break;
24005 break;
24008 break;
24011 break;
24014 break;
24017 break;
24020 break;
24023 break;
24026 break;
24029 break;
24032 break;
24033 default:
24034 return SDValue();
24035 }
24036
24037 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
24038 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
24039
24040 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
24041 return SDValue();
24042
24043 EVT DstVT = N->getValueType(0);
24044 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
24045
24046 SmallVector<SDValue, 5> Ops;
24047 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
24048 Ops.push_back(Src->getOperand(I));
24049
24050 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
24051 DCI.CombineTo(N, ExtLoad);
24052 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
24053
24054 // Return N so it doesn't get rechecked
24055 return SDValue(N, 0);
24056}
24057
24058/// Legalize the gather prefetch (scalar + vector addressing mode) when the
24059/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
24060/// != nxv2i32) do not need legalization.
24062 const unsigned OffsetPos = 4;
24063 SDValue Offset = N->getOperand(OffsetPos);
24064
24065 // Not an unpacked vector, bail out.
24066 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
24067 return SDValue();
24068
24069 // Extend the unpacked offset vector to 64-bit lanes.
24070 SDLoc DL(N);
24071 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
24072 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
24073 // Replace the offset operand with the 64-bit one.
24074 Ops[OffsetPos] = Offset;
24075
24076 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
24077}
24078
24079/// Combines a node carrying the intrinsic
24080/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
24081/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
24082/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
24083/// sve gather prefetch instruction with vector plus immediate addressing mode.
24085 unsigned ScalarSizeInBytes) {
24086 const unsigned ImmPos = 4, OffsetPos = 3;
24087 // No need to combine the node if the immediate is valid...
24088 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
24089 return SDValue();
24090
24091 // ...otherwise swap the offset base with the offset...
24092 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
24093 std::swap(Ops[ImmPos], Ops[OffsetPos]);
24094 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
24095 // `aarch64_sve_prfb_gather_uxtw_index`.
24096 SDLoc DL(N);
24097 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
24098 MVT::i64);
24099
24100 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
24101}
24102
24103// Return true if the vector operation can guarantee only the first lane of its
24104// result contains data, with all bits in other lanes set to zero.
24106 switch (Op.getOpcode()) {
24107 default:
24108 return false;
24124 return true;
24125 }
24126}
24127
24128static SDValue removeRedundantInsertVectorElt(SDNode *N) {
24129 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
24130 SDValue InsertVec = N->getOperand(0);
24131 SDValue InsertElt = N->getOperand(1);
24132 SDValue InsertIdx = N->getOperand(2);
24133
24134 // We only care about inserts into the first element...
24135 if (!isNullConstant(InsertIdx))
24136 return SDValue();
24137 // ...of a zero'd vector...
24138 if (!ISD::isConstantSplatVectorAllZeros(InsertVec.getNode()))
24139 return SDValue();
24140 // ...where the inserted data was previously extracted...
24141 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
24142 return SDValue();
24143
24144 SDValue ExtractVec = InsertElt.getOperand(0);
24145 SDValue ExtractIdx = InsertElt.getOperand(1);
24146
24147 // ...from the first element of a vector.
24148 if (!isNullConstant(ExtractIdx))
24149 return SDValue();
24150
24151 // If we get here we are effectively trying to zero lanes 1-N of a vector.
24152
24153 // Ensure there's no type conversion going on.
24154 if (N->getValueType(0) != ExtractVec.getValueType())
24155 return SDValue();
24156
24157 if (!isLanes1toNKnownZero(ExtractVec))
24158 return SDValue();
24159
24160 // The explicit zeroing is redundant.
24161 return ExtractVec;
24162}
24163
24164static SDValue
24165performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
24166 if (SDValue Res = removeRedundantInsertVectorElt(N))
24167 return Res;
24168
24169 return performPostLD1Combine(N, DCI, true);
24170}
24171
24173 EVT Ty = N->getValueType(0);
24174 if (Ty.isInteger())
24175 return SDValue();
24176
24179 if (ExtIntTy.getVectorElementType().getScalarSizeInBits() <
24181 return SDValue();
24182
24183 SDLoc DL(N);
24184 SDValue LHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(0)),
24185 DL, ExtIntTy);
24186 SDValue RHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(1)),
24187 DL, ExtIntTy);
24188 SDValue Idx = N->getOperand(2);
24189 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ExtIntTy, LHS, RHS, Idx);
24190 SDValue Trunc = DAG.getAnyExtOrTrunc(Splice, DL, IntTy);
24191 return DAG.getBitcast(Ty, Trunc);
24192}
24193
24194static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
24195 TargetLowering::DAGCombinerInfo &DCI,
24196 const AArch64Subtarget *Subtarget) {
24197 SDValue N0 = N->getOperand(0);
24198 EVT VT = N->getValueType(0);
24199
24200 // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
24201 if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND)
24202 return SDValue();
24203
24204 auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
24205 EVT EltVT = VT.getVectorElementType();
24206 return EltVT == MVT::f32 || EltVT == MVT::f64;
24207 };
24208
24209 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
24210 // We purposefully don't care about legality of the nodes here as we know
24211 // they can be split down into something legal.
24212 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
24213 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
24214 VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
24215 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
24216 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
24217 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
24218 LN0->getChain(), LN0->getBasePtr(),
24219 N0.getValueType(), LN0->getMemOperand());
24220 DCI.CombineTo(N, ExtLoad);
24221 DCI.CombineTo(
24222 N0.getNode(),
24223 DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
24224 DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
24225 ExtLoad.getValue(1));
24226 return SDValue(N, 0); // Return N so it doesn't get rechecked!
24227 }
24228
24229 return SDValue();
24230}
24231
24233 const AArch64Subtarget *Subtarget) {
24234 EVT VT = N->getValueType(0);
24235
24236 // Don't expand for NEON, SVE2 or SME
24237 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
24238 return SDValue();
24239
24240 SDLoc DL(N);
24241
24242 SDValue Mask = N->getOperand(0);
24243 SDValue In1 = N->getOperand(1);
24244 SDValue In2 = N->getOperand(2);
24245
24246 SDValue InvMask = DAG.getNOT(DL, Mask, VT);
24247 SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
24248 SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
24249 return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
24250}
24251
24253 EVT VT = N->getValueType(0);
24254
24255 SDValue Insert = N->getOperand(0);
24256 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
24257 return SDValue();
24258
24259 if (!Insert.getOperand(0).isUndef())
24260 return SDValue();
24261
24262 uint64_t IdxInsert = Insert.getConstantOperandVal(2);
24263 uint64_t IdxDupLane = N->getConstantOperandVal(1);
24264 if (IdxInsert != 0 || IdxDupLane != 0)
24265 return SDValue();
24266
24267 SDValue Bitcast = Insert.getOperand(1);
24268 if (Bitcast.getOpcode() != ISD::BITCAST)
24269 return SDValue();
24270
24271 SDValue Subvec = Bitcast.getOperand(0);
24272 EVT SubvecVT = Subvec.getValueType();
24273 if (!SubvecVT.is128BitVector())
24274 return SDValue();
24275 EVT NewSubvecVT =
24277
24278 SDLoc DL(N);
24279 SDValue NewInsert =
24280 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
24281 DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
24282 SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
24283 NewInsert, N->getOperand(1));
24284 return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
24285}
24286
24287// Try to combine mull with uzp1.
24290 SelectionDAG &DAG) {
24291 if (DCI.isBeforeLegalizeOps())
24292 return SDValue();
24293
24294 SDValue LHS = N->getOperand(0);
24295 SDValue RHS = N->getOperand(1);
24296
24297 SDValue ExtractHigh;
24298 SDValue ExtractLow;
24299 SDValue TruncHigh;
24300 SDValue TruncLow;
24301 SDLoc DL(N);
24302
24303 // Check the operands are trunc and extract_high.
24304 if (isEssentiallyExtractHighSubvector(LHS) &&
24305 RHS.getOpcode() == ISD::TRUNCATE) {
24306 TruncHigh = RHS;
24307 if (LHS.getOpcode() == ISD::BITCAST)
24308 ExtractHigh = LHS.getOperand(0);
24309 else
24310 ExtractHigh = LHS;
24311 } else if (isEssentiallyExtractHighSubvector(RHS) &&
24312 LHS.getOpcode() == ISD::TRUNCATE) {
24313 TruncHigh = LHS;
24314 if (LHS.getOpcode() == ISD::BITCAST)
24315 ExtractHigh = RHS.getOperand(0);
24316 else
24317 ExtractHigh = RHS;
24318 } else
24319 return SDValue();
24320
24321 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
24322 // with uzp1.
24323 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
24324 SDValue TruncHighOp = TruncHigh.getOperand(0);
24325 EVT TruncHighOpVT = TruncHighOp.getValueType();
24326 if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
24327 DAG.isSplatValue(TruncHighOp, false))
24328 return SDValue();
24329
24330 // Check there is other extract_high with same source vector.
24331 // For example,
24332 //
24333 // t18: v4i16 = extract_subvector t2, Constant:i64<0>
24334 // t12: v4i16 = truncate t11
24335 // t31: v4i32 = AArch64ISD::SMULL t18, t12
24336 // t23: v4i16 = extract_subvector t2, Constant:i64<4>
24337 // t16: v4i16 = truncate t15
24338 // t30: v4i32 = AArch64ISD::SMULL t23, t1
24339 //
24340 // This dagcombine assumes the two extract_high nodes use the same source
24341 // vector in order to detect the pair of mulls. If they use different source
24342 // vectors, this code will not work.
24343 bool HasFoundMULLow = true;
24344 SDValue ExtractHighSrcVec = ExtractHigh.getOperand(0);
24345 if (ExtractHighSrcVec->use_size() != 2)
24346 HasFoundMULLow = false;
24347
24348 // Find ExtractLow.
24349 for (SDNode *User : ExtractHighSrcVec.getNode()->uses()) {
24350 if (User == ExtractHigh.getNode())
24351 continue;
24352
24353 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
24354 !isNullConstant(User->getOperand(1))) {
24355 HasFoundMULLow = false;
24356 break;
24357 }
24358
24359 ExtractLow.setNode(User);
24360 }
24361
24362 if (!ExtractLow || !ExtractLow->hasOneUse())
24363 HasFoundMULLow = false;
24364
24365 // Check ExtractLow's user.
24366 if (HasFoundMULLow) {
24367 SDNode *ExtractLowUser = *ExtractLow.getNode()->use_begin();
24368 if (ExtractLowUser->getOpcode() != N->getOpcode()) {
24369 HasFoundMULLow = false;
24370 } else {
24371 if (ExtractLowUser->getOperand(0) == ExtractLow) {
24372 if (ExtractLowUser->getOperand(1).getOpcode() == ISD::TRUNCATE)
24373 TruncLow = ExtractLowUser->getOperand(1);
24374 else
24375 HasFoundMULLow = false;
24376 } else {
24377 if (ExtractLowUser->getOperand(0).getOpcode() == ISD::TRUNCATE)
24378 TruncLow = ExtractLowUser->getOperand(0);
24379 else
24380 HasFoundMULLow = false;
24381 }
24382 }
24383 }
24384
24385 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
24386 // with uzp1.
24387 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
24388 EVT TruncHighVT = TruncHigh.getValueType();
24389 EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(*DAG.getContext());
24390 SDValue TruncLowOp =
24391 HasFoundMULLow ? TruncLow.getOperand(0) : DAG.getUNDEF(UZP1VT);
24392 EVT TruncLowOpVT = TruncLowOp.getValueType();
24393 if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
24394 DAG.isSplatValue(TruncLowOp, false)))
24395 return SDValue();
24396
24397 // Create uzp1, extract_high and extract_low.
24398 if (TruncHighOpVT != UZP1VT)
24399 TruncHighOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncHighOp);
24400 if (TruncLowOpVT != UZP1VT)
24401 TruncLowOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncLowOp);
24402
24403 SDValue UZP1 =
24404 DAG.getNode(AArch64ISD::UZP1, DL, UZP1VT, TruncLowOp, TruncHighOp);
24405 SDValue HighIdxCst =
24406 DAG.getConstant(TruncHighVT.getVectorNumElements(), DL, MVT::i64);
24407 SDValue NewTruncHigh =
24408 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncHighVT, UZP1, HighIdxCst);
24409 DAG.ReplaceAllUsesWith(TruncHigh, NewTruncHigh);
24410
24411 if (HasFoundMULLow) {
24412 EVT TruncLowVT = TruncLow.getValueType();
24413 SDValue NewTruncLow = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncLowVT,
24414 UZP1, ExtractLow.getOperand(1));
24415 DAG.ReplaceAllUsesWith(TruncLow, NewTruncLow);
24416 }
24417
24418 return SDValue(N, 0);
24419}
24420
24421static SDValue performMULLCombine(SDNode *N,
24422 TargetLowering::DAGCombinerInfo &DCI,
24423 SelectionDAG &DAG) {
24424 if (SDValue Val =
24426 return Val;
24427
24428 if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
24429 return Val;
24430
24431 return SDValue();
24432}
24433
24434static SDValue
24435performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
24436 SelectionDAG &DAG) {
24437 // Let's do below transform.
24438 //
24439 // t34: v4i32 = AArch64ISD::UADDLV t2
24440 // t35: i32 = extract_vector_elt t34, Constant:i64<0>
24441 // t7: i64 = zero_extend t35
24442 // t20: v1i64 = scalar_to_vector t7
24443 // ==>
24444 // t34: v4i32 = AArch64ISD::UADDLV t2
24445 // t39: v2i32 = extract_subvector t34, Constant:i64<0>
24446 // t40: v1i64 = AArch64ISD::NVCAST t39
24447 if (DCI.isBeforeLegalizeOps())
24448 return SDValue();
24449
24450 EVT VT = N->getValueType(0);
24451 if (VT != MVT::v1i64)
24452 return SDValue();
24453
24454 SDValue ZEXT = N->getOperand(0);
24455 if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
24456 return SDValue();
24457
24458 SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(0);
24459 if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
24460 EXTRACT_VEC_ELT.getValueType() != MVT::i32)
24461 return SDValue();
24462
24463 if (!isNullConstant(EXTRACT_VEC_ELT.getOperand(1)))
24464 return SDValue();
24465
24466 SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(0);
24467 if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
24468 UADDLV.getValueType() != MVT::v4i32 ||
24469 UADDLV.getOperand(0).getValueType() != MVT::v8i8)
24470 return SDValue();
24471
24472 // Let's generate the new sequence with AArch64ISD::NVCAST.
24473 SDLoc DL(N);
24474 SDValue EXTRACT_SUBVEC =
24475 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
24476 DAG.getConstant(0, DL, MVT::i64));
24477 SDValue NVCAST =
24478 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, EXTRACT_SUBVEC);
24479
24480 return NVCAST;
24481}
24482
24484 DAGCombinerInfo &DCI) const {
24485 SelectionDAG &DAG = DCI.DAG;
24486 switch (N->getOpcode()) {
24487 default:
24488 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
24489 break;
24490 case ISD::VECREDUCE_AND:
24491 case ISD::VECREDUCE_OR:
24492 case ISD::VECREDUCE_XOR:
24493 return performVecReduceBitwiseCombine(N, DCI, DAG);
24494 case ISD::ADD:
24495 case ISD::SUB:
24496 return performAddSubCombine(N, DCI);
24497 case ISD::BUILD_VECTOR:
24498 return performBuildVectorCombine(N, DCI, DAG);
24499 case ISD::TRUNCATE:
24500 return performTruncateCombine(N, DAG);
24501 case AArch64ISD::ANDS:
24502 return performFlagSettingCombine(N, DCI, ISD::AND);
24503 case AArch64ISD::ADC:
24504 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
24505 return R;
24506 return foldADCToCINC(N, DAG);
24507 case AArch64ISD::SBC:
24508 return foldOverflowCheck(N, DAG, /* IsAdd */ false);
24509 case AArch64ISD::ADCS:
24510 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
24511 return R;
24513 case AArch64ISD::SBCS:
24514 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
24515 return R;
24517 case AArch64ISD::BICi: {
24519 APInt::getAllOnes(N->getValueType(0).getScalarSizeInBits());
24520 APInt DemandedElts =
24521 APInt::getAllOnes(N->getValueType(0).getVectorNumElements());
24522
24524 SDValue(N, 0), DemandedBits, DemandedElts, DCI))
24525 return SDValue();
24526
24527 break;
24528 }
24529 case ISD::XOR:
24530 return performXorCombine(N, DAG, DCI, Subtarget);
24531 case ISD::MUL:
24532 return performMulCombine(N, DAG, DCI, Subtarget);
24533 case ISD::SINT_TO_FP:
24534 case ISD::UINT_TO_FP:
24535 return performIntToFpCombine(N, DAG, Subtarget);
24536 case ISD::FP_TO_SINT:
24537 case ISD::FP_TO_UINT:
24540 return performFpToIntCombine(N, DAG, DCI, Subtarget);
24541 case ISD::FDIV:
24542 return performFDivCombine(N, DAG, DCI, Subtarget);
24543 case ISD::OR:
24544 return performORCombine(N, DCI, Subtarget, *this);
24545 case ISD::AND:
24546 return performANDCombine(N, DCI);
24547 case ISD::FADD:
24548 return performFADDCombine(N, DCI);
24550 return performIntrinsicCombine(N, DCI, Subtarget);
24551 case ISD::ANY_EXTEND:
24552 case ISD::ZERO_EXTEND:
24553 case ISD::SIGN_EXTEND:
24554 return performExtendCombine(N, DCI, DAG);
24556 return performSignExtendInRegCombine(N, DCI, DAG);
24558 return performConcatVectorsCombine(N, DCI, DAG);
24560 return performExtractSubvectorCombine(N, DCI, DAG);
24562 return performInsertSubvectorCombine(N, DCI, DAG);
24563 case ISD::SELECT:
24564 return performSelectCombine(N, DCI);
24565 case ISD::VSELECT:
24566 return performVSelectCombine(N, DCI.DAG);
24567 case ISD::SETCC:
24568 return performSETCCCombine(N, DCI, DAG);
24569 case ISD::LOAD:
24570 return performLOADCombine(N, DCI, DAG, Subtarget);
24571 case ISD::STORE:
24572 return performSTORECombine(N, DCI, DAG, Subtarget);
24573 case ISD::MSTORE:
24574 return performMSTORECombine(N, DCI, DAG, Subtarget);
24575 case ISD::MGATHER:
24576 case ISD::MSCATTER:
24577 return performMaskedGatherScatterCombine(N, DCI, DAG);
24578 case ISD::VECTOR_SPLICE:
24579 return performSVESpliceCombine(N, DAG);
24580 case ISD::FP_EXTEND:
24581 return performFPExtendCombine(N, DAG, DCI, Subtarget);
24582 case AArch64ISD::BRCOND:
24583 return performBRCONDCombine(N, DCI, DAG);
24584 case AArch64ISD::TBNZ:
24585 case AArch64ISD::TBZ:
24586 return performTBZCombine(N, DCI, DAG);
24587 case AArch64ISD::CSEL:
24588 return performCSELCombine(N, DCI, DAG);
24589 case AArch64ISD::DUP:
24594 return performDUPCombine(N, DCI);
24596 return performDupLane128Combine(N, DAG);
24597 case AArch64ISD::NVCAST:
24598 return performNVCASTCombine(N, DAG);
24599 case AArch64ISD::SPLICE:
24600 return performSpliceCombine(N, DAG);
24603 return performUnpackCombine(N, DAG, Subtarget);
24604 case AArch64ISD::UZP1:
24605 return performUzpCombine(N, DAG, Subtarget);
24607 return performSetccMergeZeroCombine(N, DCI);
24624 return performGLD1Combine(N, DAG);
24625 case AArch64ISD::VASHR:
24626 case AArch64ISD::VLSHR:
24627 return performVectorShiftCombine(N, *this, DCI);
24629 return performSunpkloCombine(N, DAG);
24630 case AArch64ISD::BSP:
24631 return performBSPExpandForSVE(N, DAG, Subtarget);
24633 return performInsertVectorEltCombine(N, DCI);
24635 return performExtractVectorEltCombine(N, DCI, Subtarget);
24636 case ISD::VECREDUCE_ADD:
24637 return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
24638 case AArch64ISD::UADDV:
24639 return performUADDVCombine(N, DAG);
24640 case AArch64ISD::SMULL:
24641 case AArch64ISD::UMULL:
24642 case AArch64ISD::PMULL:
24643 return performMULLCombine(N, DCI, DAG);
24646 switch (N->getConstantOperandVal(1)) {
24647 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
24648 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
24649 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
24650 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
24651 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
24652 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
24653 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
24654 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
24655 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
24656 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
24657 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
24658 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
24659 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
24660 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
24661 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
24662 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
24664 case Intrinsic::aarch64_neon_ld2:
24665 case Intrinsic::aarch64_neon_ld3:
24666 case Intrinsic::aarch64_neon_ld4:
24667 case Intrinsic::aarch64_neon_ld1x2:
24668 case Intrinsic::aarch64_neon_ld1x3:
24669 case Intrinsic::aarch64_neon_ld1x4:
24670 case Intrinsic::aarch64_neon_ld2lane:
24671 case Intrinsic::aarch64_neon_ld3lane:
24672 case Intrinsic::aarch64_neon_ld4lane:
24673 case Intrinsic::aarch64_neon_ld2r:
24674 case Intrinsic::aarch64_neon_ld3r:
24675 case Intrinsic::aarch64_neon_ld4r:
24676 case Intrinsic::aarch64_neon_st2:
24677 case Intrinsic::aarch64_neon_st3:
24678 case Intrinsic::aarch64_neon_st4:
24679 case Intrinsic::aarch64_neon_st1x2:
24680 case Intrinsic::aarch64_neon_st1x3:
24681 case Intrinsic::aarch64_neon_st1x4:
24682 case Intrinsic::aarch64_neon_st2lane:
24683 case Intrinsic::aarch64_neon_st3lane:
24684 case Intrinsic::aarch64_neon_st4lane:
24685 return performNEONPostLDSTCombine(N, DCI, DAG);
24686 case Intrinsic::aarch64_sve_ldnt1:
24687 return performLDNT1Combine(N, DAG);
24688 case Intrinsic::aarch64_sve_ld1rq:
24689 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
24690 case Intrinsic::aarch64_sve_ld1ro:
24691 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
24692 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
24694 case Intrinsic::aarch64_sve_ldnt1_gather:
24696 case Intrinsic::aarch64_sve_ldnt1_gather_index:
24697 return performGatherLoadCombine(N, DAG,
24699 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
24701 case Intrinsic::aarch64_sve_ld1:
24703 case Intrinsic::aarch64_sve_ldnf1:
24705 case Intrinsic::aarch64_sve_ldff1:
24707 case Intrinsic::aarch64_sve_st1:
24708 return performST1Combine(N, DAG);
24709 case Intrinsic::aarch64_sve_stnt1:
24710 return performSTNT1Combine(N, DAG);
24711 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
24713 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
24715 case Intrinsic::aarch64_sve_stnt1_scatter:
24717 case Intrinsic::aarch64_sve_stnt1_scatter_index:
24719 case Intrinsic::aarch64_sve_ld1_gather:
24721 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
24722 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
24724 case Intrinsic::aarch64_sve_ld1q_gather_index:
24725 return performGatherLoadCombine(N, DAG,
24727 case Intrinsic::aarch64_sve_ld1_gather_index:
24728 return performGatherLoadCombine(N, DAG,
24730 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
24732 /*OnlyPackedOffsets=*/false);
24733 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
24735 /*OnlyPackedOffsets=*/false);
24736 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
24737 return performGatherLoadCombine(N, DAG,
24739 /*OnlyPackedOffsets=*/false);
24740 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
24741 return performGatherLoadCombine(N, DAG,
24743 /*OnlyPackedOffsets=*/false);
24744 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
24746 case Intrinsic::aarch64_sve_ldff1_gather:
24748 case Intrinsic::aarch64_sve_ldff1_gather_index:
24749 return performGatherLoadCombine(N, DAG,
24751 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
24752 return performGatherLoadCombine(N, DAG,
24754 /*OnlyPackedOffsets=*/false);
24755 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
24756 return performGatherLoadCombine(N, DAG,
24758 /*OnlyPackedOffsets=*/false);
24759 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
24760 return performGatherLoadCombine(N, DAG,
24762 /*OnlyPackedOffsets=*/false);
24763 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
24764 return performGatherLoadCombine(N, DAG,
24766 /*OnlyPackedOffsets=*/false);
24767 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
24768 return performGatherLoadCombine(N, DAG,
24770 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
24771 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
24773 case Intrinsic::aarch64_sve_st1q_scatter_index:
24775 case Intrinsic::aarch64_sve_st1_scatter:
24777 case Intrinsic::aarch64_sve_st1_scatter_index:
24779 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
24781 /*OnlyPackedOffsets=*/false);
24782 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
24784 /*OnlyPackedOffsets=*/false);
24785 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
24786 return performScatterStoreCombine(N, DAG,
24788 /*OnlyPackedOffsets=*/false);
24789 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
24790 return performScatterStoreCombine(N, DAG,
24792 /*OnlyPackedOffsets=*/false);
24793 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
24795 case Intrinsic::aarch64_rndr:
24796 case Intrinsic::aarch64_rndrrs: {
24797 unsigned IntrinsicID = N->getConstantOperandVal(1);
24798 auto Register =
24799 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
24800 : AArch64SysReg::RNDRRS);
24801 SDLoc DL(N);
24802 SDValue A = DAG.getNode(
24803 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other),
24804 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i64));
24805 SDValue B = DAG.getNode(
24806 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
24807 DAG.getConstant(0, DL, MVT::i32),
24808 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
24809 return DAG.getMergeValues(
24810 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
24811 }
24812 case Intrinsic::aarch64_sme_ldr_zt:
24814 DAG.getVTList(MVT::Other), N->getOperand(0),
24815 N->getOperand(2), N->getOperand(3));
24816 case Intrinsic::aarch64_sme_str_zt:
24817 return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N),
24818 DAG.getVTList(MVT::Other), N->getOperand(0),
24819 N->getOperand(2), N->getOperand(3));
24820 default:
24821 break;
24822 }
24823 break;
24824 case ISD::GlobalAddress:
24825 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
24826 case ISD::CTLZ:
24827 return performCTLZCombine(N, DAG, Subtarget);
24829 return performScalarToVectorCombine(N, DCI, DAG);
24830 }
24831 return SDValue();
24832}
24833
24834 // Check if the return value is used only as a return value, as otherwise
24835// we can't perform a tail-call. In particular, we need to check for
24836// target ISD nodes that are returns and any other "odd" constructs
24837// that the generic analysis code won't necessarily catch.
24838bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
24839 SDValue &Chain) const {
24840 if (N->getNumValues() != 1)
24841 return false;
24842 if (!N->hasNUsesOfValue(1, 0))
24843 return false;
24844
24845 SDValue TCChain = Chain;
24846 SDNode *Copy = *N->use_begin();
24847 if (Copy->getOpcode() == ISD::CopyToReg) {
24848 // If the copy has a glue operand, we conservatively assume it isn't safe to
24849 // perform a tail call.
24850 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
24851 MVT::Glue)
24852 return false;
24853 TCChain = Copy->getOperand(0);
24854 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
24855 return false;
24856
24857 bool HasRet = false;
24858 for (SDNode *Node : Copy->uses()) {
24859 if (Node->getOpcode() != AArch64ISD::RET_GLUE)
24860 return false;
24861 HasRet = true;
24862 }
24863
24864 if (!HasRet)
24865 return false;
24866
24867 Chain = TCChain;
24868 return true;
24869}
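// Hypothetical C-level illustration (not part of this file) of the property
// checked above: in f1_sketch the callee's result feeds the return directly,
// so a tail call is possible; in f2_sketch the extra arithmetic use blocks it.
// The *_sketch names are illustrative only.
int g_sketch(int);
int f1_sketch(int x) { return g_sketch(x); }     // result only reaches the return
int f2_sketch(int x) { return g_sketch(x) + 1; } // extra use of the result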
24870
24871 // Return whether an instruction can potentially be optimized to a tail
24872// call. This will cause the optimizers to attempt to move, or duplicate,
24873// return instructions to help enable tail call optimizations for this
24874// instruction.
24875bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
24876 return CI->isTailCall();
24877}
24878
24879bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
24880 Register Offset, bool IsPre,
24881 MachineRegisterInfo &MRI) const {
24882 auto CstOffset = getIConstantVRegVal(Offset, MRI);
24883 if (!CstOffset || CstOffset->isZero())
24884 return false;
24885
24886 // All of the indexed addressing mode instructions take a signed 9 bit
24887 // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
24888 // encodes the sign/indexing direction.
24889 return isInt<9>(CstOffset->getSExtValue());
24890}
24891
24892bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
24893 SDValue &Base,
24894 SDValue &Offset,
24895 SelectionDAG &DAG) const {
24896 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
24897 return false;
24898
24899 // Non-null if there is exactly one user of the loaded value (ignoring chain).
24900 SDNode *ValOnlyUser = nullptr;
24901 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
24902 ++UI) {
24903 if (UI.getUse().getResNo() == 1)
24904 continue; // Ignore chain.
24905 if (ValOnlyUser == nullptr)
24906 ValOnlyUser = *UI;
24907 else {
24908 ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
24909 break;
24910 }
24911 }
24912
24913 auto IsUndefOrZero = [](SDValue V) {
24914 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
24915 };
24916
24917 // If the only user of the value is a scalable vector splat, it is
24918 // preferable to do a replicating load (ld1r*).
24919 if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
24920 (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
24921 (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
24922 IsUndefOrZero(ValOnlyUser->getOperand(2)))))
24923 return false;
24924
24925 Base = Op->getOperand(0);
24926 // All of the indexed addressing mode instructions take a signed
24927 // 9 bit immediate offset.
24928 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
24929 int64_t RHSC = RHS->getSExtValue();
24930 if (Op->getOpcode() == ISD::SUB)
24931 RHSC = -(uint64_t)RHSC;
24932 if (!isInt<9>(RHSC))
24933 return false;
24934 // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
24935 // when dealing with subtraction.
24936 Offset = DAG.getConstant(RHSC, SDLoc(N), RHS->getValueType(0));
24937 return true;
24938 }
24939 return false;
24940}
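// Standalone sketch (not LLVM code) of the signed 9-bit immediate range that
// both helpers above enforce for pre/post-indexed addressing: offsets must fit
// in [-256, 255], and a SUB offset is negated before the check. The
// fitsIndexedImm9 name is illustrative only.
#include <cstdint>
#include <cstdio>

static bool fitsIndexedImm9(int64_t Offset, bool IsSub) {
  if (IsSub)
    Offset = -Offset;
  return Offset >= -256 && Offset <= 255; // same range as isInt<9>()
}

int main() {
  std::printf("%d %d %d\n", fitsIndexedImm9(255, false),
              fitsIndexedImm9(256, true),   // sub 256 => offset -256, still ok
              fitsIndexedImm9(256, false)); // prints: 1 1 0
}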
24941
24942bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
24943 SDValue &Offset,
24945 SelectionDAG &DAG) const {
24946 EVT VT;
24947 SDValue Ptr;
24948 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
24949 VT = LD->getMemoryVT();
24950 Ptr = LD->getBasePtr();
24951 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
24952 VT = ST->getMemoryVT();
24953 Ptr = ST->getBasePtr();
24954 } else
24955 return false;
24956
24957 if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, DAG))
24958 return false;
24959 AM = ISD::PRE_INC;
24960 return true;
24961}
24962
24963bool AArch64TargetLowering::getPostIndexedAddressParts(
24965 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
24966 EVT VT;
24967 SDValue Ptr;
24968 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
24969 VT = LD->getMemoryVT();
24970 Ptr = LD->getBasePtr();
24971 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
24972 VT = ST->getMemoryVT();
24973 Ptr = ST->getBasePtr();
24974 } else
24975 return false;
24976
24977 if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
24978 return false;
24979 // Post-indexing updates the base, so it's not a valid transform
24980 // if that's not the same as the load's pointer.
24981 if (Ptr != Base)
24982 return false;
24983 AM = ISD::POST_INC;
24984 return true;
24985}
24986
24989 SelectionDAG &DAG) {
24990 SDLoc DL(N);
24991 SDValue Op = N->getOperand(0);
24992 EVT VT = N->getValueType(0);
24993 [[maybe_unused]] EVT SrcVT = Op.getValueType();
24994 assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
24995 "Must be bool vector.");
24996
24997 // Special handling for Clang's __builtin_convertvector. For vectors with <8
24998 // elements, it adds a vector concatenation with undef(s). If we encounter
24999 // this here, we can skip the concat.
25000 if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(0).isUndef()) {
25001 bool AllUndef = true;
25002 for (unsigned I = 1; I < Op.getNumOperands(); ++I)
25003 AllUndef &= Op.getOperand(I).isUndef();
25004
25005 if (AllUndef)
25006 Op = Op.getOperand(0);
25007 }
25008
25009 SDValue VectorBits = vectorToScalarBitmask(Op.getNode(), DAG);
25010 if (VectorBits)
25011 Results.push_back(DAG.getZExtOrTrunc(VectorBits, DL, VT));
25012}
25013
25016 SelectionDAG &DAG, EVT ExtendVT,
25017 EVT CastVT) {
25018 SDLoc DL(N);
25019 SDValue Op = N->getOperand(0);
25020 EVT VT = N->getValueType(0);
25021
25022 // Use SCALAR_TO_VECTOR for lane zero
25023 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtendVT, Op);
25024 SDValue CastVal = DAG.getNode(ISD::BITCAST, DL, CastVT, Vec);
25025 SDValue IdxZero = DAG.getVectorIdxConstant(0, DL);
25026 Results.push_back(
25027 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CastVal, IdxZero));
25028}
25029
25030void AArch64TargetLowering::ReplaceBITCASTResults(
25032 SDLoc DL(N);
25033 SDValue Op = N->getOperand(0);
25034 EVT VT = N->getValueType(0);
25035 EVT SrcVT = Op.getValueType();
25036
25037 if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
25038 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
25039 return;
25040 }
25041
25042 if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
25043 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
25044 return;
25045 }
25046
25047 if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
25048 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
25049 return;
25050 }
25051
25052 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
25053 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
25054 "Expected fp->int bitcast!");
25055
25056 // Bitcasting between unpacked vector types of different element counts is
25057 // not a NOP because the live elements are laid out differently.
25058 // 01234567
25059 // e.g. nxv2i32 = XX??XX??
25060 // nxv4f16 = X?X?X?X?
25061 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
25062 return;
25063
25064 SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
25065 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
25066 return;
25067 }
25068
25069 if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
25070 !VT.isVector())
25071 return replaceBoolVectorBitcast(N, Results, DAG);
25072
25073 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
25074 return;
25075
25076 Op = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
25077 DAG.getUNDEF(MVT::i32), Op);
25078 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
25079 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
25080}
25081
25083 SelectionDAG &DAG,
25084 const AArch64Subtarget *Subtarget) {
25085 EVT VT = N->getValueType(0);
25086 if (!VT.is256BitVector() ||
25088 !N->getFlags().hasAllowReassociation()) ||
25089 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
25090 VT.getScalarType() == MVT::bf16)
25091 return;
25092
25093 SDValue X = N->getOperand(0);
25094 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
25095 if (!Shuf) {
25096 Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
25097 X = N->getOperand(1);
25098 if (!Shuf)
25099 return;
25100 }
25101
25102 if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
25103 return;
25104
25105 // Check the mask is 1,0,3,2,5,4,...
25106 ArrayRef<int> Mask = Shuf->getMask();
25107 for (int I = 0, E = Mask.size(); I < E; I++)
25108 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
25109 return;
25110
25111 SDLoc DL(N);
25112 auto LoHi = DAG.SplitVector(X, DL);
25113 assert(LoHi.first.getValueType() == LoHi.second.getValueType());
25114 SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
25115 LoHi.first, LoHi.second);
25116
25117 // Shuffle the elements back into order.
25118 SmallVector<int> NMask;
25119 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
25120 NMask.push_back(I);
25121 NMask.push_back(I);
25122 }
25123 Results.push_back(
25124 DAG.getVectorShuffle(VT, DL,
25125 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
25126 DAG.getUNDEF(LoHi.first.getValueType())),
25127 DAG.getUNDEF(VT), NMask));
25128}
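// Standalone sketch (not LLVM code) of the "swapped pairs" shuffle-mask test
// used above: the combine only fires when the mask is 1,0,3,2,5,4,...
// The isSwappedPairMask name is illustrative only.
#include <vector>

static bool isSwappedPairMask(const std::vector<int> &Mask) {
  for (int I = 0, E = (int)Mask.size(); I < E; ++I)
    if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
      return false;
  return true;
}

// isSwappedPairMask({1, 0, 3, 2, 5, 4, 7, 6}) -> true
// isSwappedPairMask({1, 0, 2, 3})             -> false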
25129
25132 SelectionDAG &DAG, unsigned InterOp,
25133 unsigned AcrossOp) {
25134 EVT LoVT, HiVT;
25135 SDValue Lo, Hi;
25136 SDLoc dl(N);
25137 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
25138 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
25139 SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
25140 SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
25141 Results.push_back(SplitVal);
25142}
25143
25144void AArch64TargetLowering::ReplaceExtractSubVectorResults(
25146 SDValue In = N->getOperand(0);
25147 EVT InVT = In.getValueType();
25148
25149 // Common code will handle these just fine.
25150 if (!InVT.isScalableVector() || !InVT.isInteger())
25151 return;
25152
25153 SDLoc DL(N);
25154 EVT VT = N->getValueType(0);
25155
25156 // The following checks bail if this is not a halving operation.
25157
25159
25160 if (InVT.getVectorElementCount() != (ResEC * 2))
25161 return;
25162
25163 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
25164 if (!CIndex)
25165 return;
25166
25167 unsigned Index = CIndex->getZExtValue();
25168 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
25169 return;
25170
25171 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
25172 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
25173
25174 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
25175 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
25176}
25177
25178// Create an even/odd pair of X registers holding integer value V.
25180 SDLoc dl(V.getNode());
25181 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i64, MVT::i64);
25182 if (DAG.getDataLayout().isBigEndian())
25183 std::swap (VLo, VHi);
25184 SDValue RegClass =
25185 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
25186 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
25187 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
25188 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
25189 return SDValue(
25190 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
25191}
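// Minimal sketch (assumed layout, not LLVM code) of what createGPRPairNode
// does conceptually: split a 128-bit value into two 64-bit halves and swap
// them on big-endian targets so the even/odd subregisters pick up the right
// words. The *Sketch names are illustrative only.
#include <cstdint>
#include <utility>

struct U128Sketch { uint64_t Lo, Hi; };

static std::pair<uint64_t, uint64_t> splitForRegPairSketch(U128Sketch V,
                                                           bool BigEndian) {
  uint64_t First = V.Lo, Second = V.Hi; // low half first on little-endian
  if (BigEndian)
    std::swap(First, Second);
  return {First, Second};
}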
25192
25195 SelectionDAG &DAG,
25196 const AArch64Subtarget *Subtarget) {
25197 assert(N->getValueType(0) == MVT::i128 &&
25198 "AtomicCmpSwap on types less than 128 should be legal");
25199
25200 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
25201 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
25202 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
25203 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
25204 SDValue Ops[] = {
25205 createGPRPairNode(DAG, N->getOperand(2)), // Compare value
25206 createGPRPairNode(DAG, N->getOperand(3)), // Store value
25207 N->getOperand(1), // Ptr
25208 N->getOperand(0), // Chain in
25209 };
25210
25211 unsigned Opcode;
25212 switch (MemOp->getMergedOrdering()) {
25214 Opcode = AArch64::CASPX;
25215 break;
25217 Opcode = AArch64::CASPAX;
25218 break;
25220 Opcode = AArch64::CASPLX;
25221 break;
25224 Opcode = AArch64::CASPALX;
25225 break;
25226 default:
25227 llvm_unreachable("Unexpected ordering!");
25228 }
25229
25230 MachineSDNode *CmpSwap = DAG.getMachineNode(
25231 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
25232 DAG.setNodeMemRefs(CmpSwap, {MemOp});
25233
25234 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
25235 if (DAG.getDataLayout().isBigEndian())
25236 std::swap(SubReg1, SubReg2);
25237 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
25238 SDValue(CmpSwap, 0));
25239 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
25240 SDValue(CmpSwap, 0));
25241 Results.push_back(
25242 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
25243 Results.push_back(SDValue(CmpSwap, 1)); // Chain out
25244 return;
25245 }
25246
25247 unsigned Opcode;
25248 switch (MemOp->getMergedOrdering()) {
25250 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
25251 break;
25253 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
25254 break;
25256 Opcode = AArch64::CMP_SWAP_128_RELEASE;
25257 break;
25260 Opcode = AArch64::CMP_SWAP_128;
25261 break;
25262 default:
25263 llvm_unreachable("Unexpected ordering!");
25264 }
25265
25266 SDLoc DL(N);
25267 auto Desired = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64);
25268 auto New = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64);
25269 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
25270 New.first, New.second, N->getOperand(0)};
25271 SDNode *CmpSwap = DAG.getMachineNode(
25272 Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
25273 Ops);
25274 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
25275
25276 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
25277 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
25278 Results.push_back(SDValue(CmpSwap, 3));
25279}
25280
25281static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
25282 AtomicOrdering Ordering) {
25283 // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
25284 // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
25285 // the type is not legal. Therefore we shouldn't expect to see a 128-bit
25286 // ATOMIC_LOAD_CLR at any point.
25287 assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
25288 "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
25289 assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
25290 assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
25291
25292 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
25293 // The operand will need to be XORed in a separate step.
25294 switch (Ordering) {
25296 return AArch64::LDCLRP;
25297 break;
25299 return AArch64::LDCLRPA;
25300 break;
25302 return AArch64::LDCLRPL;
25303 break;
25306 return AArch64::LDCLRPAL;
25307 break;
25308 default:
25309 llvm_unreachable("Unexpected ordering!");
25310 }
25311 }
25312
25313 if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
25314 switch (Ordering) {
25316 return AArch64::LDSETP;
25317 break;
25319 return AArch64::LDSETPA;
25320 break;
25322 return AArch64::LDSETPL;
25323 break;
25326 return AArch64::LDSETPAL;
25327 break;
25328 default:
25329 llvm_unreachable("Unexpected ordering!");
25330 }
25331 }
25332
25333 if (ISDOpcode == ISD::ATOMIC_SWAP) {
25334 switch (Ordering) {
25336 return AArch64::SWPP;
25337 break;
25339 return AArch64::SWPPA;
25340 break;
25342 return AArch64::SWPPL;
25343 break;
25346 return AArch64::SWPPAL;
25347 break;
25348 default:
25349 llvm_unreachable("Unexpected ordering!");
25350 }
25351 }
25352
25353 llvm_unreachable("Unexpected ISDOpcode!");
25354}
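// Standalone sketch (not LLVM code) of why ATOMIC_LOAD_AND maps to the LDCLRP
// family above: ldclr clears the bits set in its operand, so the operand is
// inverted in a separate step and old & Val == old & ~(~Val).
#include <cstdint>
#include <cstdio>

int main() {
  uint64_t Old = 0xF0F0F0F0F0F0F0F0ULL, Val = 0x00FF00FF00FF00FFULL;
  uint64_t ViaAnd = Old & Val;
  uint64_t ViaClr = Old & ~(~Val); // what LDCLR computes given an inverted input
  std::printf("%d\n", ViaAnd == ViaClr); // prints 1
}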
25355
25358 SelectionDAG &DAG,
25359 const AArch64Subtarget *Subtarget) {
25360 // LSE128 has 128-bit RMW ops, but i128 is not a legal type, so lower it
25361 // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
25362 // rather than the CASP instructions, because CASP has register classes for
25363 // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
25364 // to present them as single operands. LSE128 instructions use the GPR64
25365 // register class (because the pair does not have to be sequential), like
25366 // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
25367
25368 assert(N->getValueType(0) == MVT::i128 &&
25369 "AtomicLoadXXX on types less than 128 should be legal");
25370
25371 if (!Subtarget->hasLSE128())
25372 return;
25373
25374 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
25375 const SDValue &Chain = N->getOperand(0);
25376 const SDValue &Ptr = N->getOperand(1);
25377 const SDValue &Val128 = N->getOperand(2);
25378 std::pair<SDValue, SDValue> Val2x64 =
25379 DAG.SplitScalar(Val128, SDLoc(Val128), MVT::i64, MVT::i64);
25380
25381 const unsigned ISDOpcode = N->getOpcode();
25382 const unsigned MachineOpcode =
25383 getAtomicLoad128Opcode(ISDOpcode, MemOp->getMergedOrdering());
25384
25385 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
25386 SDLoc dl(Val128);
25387 Val2x64.first =
25388 DAG.getNode(ISD::XOR, dl, MVT::i64,
25389 DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.first);
25390 Val2x64.second =
25391 DAG.getNode(ISD::XOR, dl, MVT::i64,
25392 DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.second);
25393 }
25394
25395 SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
25396 if (DAG.getDataLayout().isBigEndian())
25397 std::swap(Ops[0], Ops[1]);
25398
25399 MachineSDNode *AtomicInst =
25400 DAG.getMachineNode(MachineOpcode, SDLoc(N),
25401 DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops);
25402
25403 DAG.setNodeMemRefs(AtomicInst, {MemOp});
25404
25405 SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
25406 if (DAG.getDataLayout().isBigEndian())
25407 std::swap(Lo, Hi);
25408
25409 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
25410 Results.push_back(SDValue(AtomicInst, 2)); // Chain out
25411}
25412
25413void AArch64TargetLowering::ReplaceNodeResults(
25415 switch (N->getOpcode()) {
25416 default:
25417 llvm_unreachable("Don't know how to custom expand this");
25418 case ISD::BITCAST:
25419 ReplaceBITCASTResults(N, Results, DAG);
25420 return;
25421 case ISD::VECREDUCE_ADD:
25426 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
25427 return;
25428 case ISD::ADD:
25429 case ISD::FADD:
25430 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
25431 return;
25432
25433 case ISD::CTPOP:
25434 case ISD::PARITY:
25435 if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
25436 Results.push_back(Result);
25437 return;
25438 case AArch64ISD::SADDV:
25440 return;
25441 case AArch64ISD::UADDV:
25443 return;
25444 case AArch64ISD::SMINV:
25446 return;
25447 case AArch64ISD::UMINV:
25449 return;
25450 case AArch64ISD::SMAXV:
25452 return;
25453 case AArch64ISD::UMAXV:
25455 return;
25456 case ISD::MULHS:
25458 Results.push_back(
25459 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHS_PRED));
25460 return;
25461 case ISD::MULHU:
25463 Results.push_back(
25464 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHU_PRED));
25465 return;
25466 case ISD::FP_TO_UINT:
25467 case ISD::FP_TO_SINT:
25470 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
25471 // Let normal code take care of it by not adding anything to Results.
25472 return;
25474 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
25475 return;
25477 assert(N->getValueType(0) != MVT::i128 &&
25478 "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
25479 break;
25482 case ISD::ATOMIC_SWAP: {
25483 assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
25484 "Expected 128-bit atomicrmw.");
25485 // These need custom type legalisation so we go directly to instruction.
25486 ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
25487 return;
25488 }
25489 case ISD::ATOMIC_LOAD:
25490 case ISD::LOAD: {
25491 MemSDNode *LoadNode = cast<MemSDNode>(N);
25492 EVT MemVT = LoadNode->getMemoryVT();
25493 // Handle lowering 256 bit non temporal loads into LDNP for little-endian
25494 // targets.
25495 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
25496 MemVT.getSizeInBits() == 256u &&
25497 (MemVT.getScalarSizeInBits() == 8u ||
25498 MemVT.getScalarSizeInBits() == 16u ||
25499 MemVT.getScalarSizeInBits() == 32u ||
25500 MemVT.getScalarSizeInBits() == 64u)) {
25501
25504 DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
25505 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
25506 MVT::Other}),
25507 {LoadNode->getChain(), LoadNode->getBasePtr()},
25508 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
25509
25510 SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
25511 Result.getValue(0), Result.getValue(1));
25512 Results.append({Pair, Result.getValue(2) /* Chain */});
25513 return;
25514 }
25515
25516 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
25517 LoadNode->getMemoryVT() != MVT::i128) {
25518 // Loads that are neither volatile nor atomic are optimized later in
25519 // AArch64's load/store optimizer.
25520 return;
25521 }
25522
25523 if (SDValue(N, 0).getValueType() == MVT::i128) {
25524 auto *AN = dyn_cast<AtomicSDNode>(LoadNode);
25525 bool isLoadAcquire =
25527 unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
25528
25529 if (isLoadAcquire)
25530 assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
25531
25533 Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
25534 {LoadNode->getChain(), LoadNode->getBasePtr()},
25535 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
25536
25537 unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
25538
25539 SDValue Pair =
25540 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
25541 Result.getValue(FirstRes), Result.getValue(1 - FirstRes));
25542 Results.append({Pair, Result.getValue(2) /* Chain */});
25543 }
25544 return;
25545 }
25547 ReplaceExtractSubVectorResults(N, Results, DAG);
25548 return;
25551 // Custom lowering has been requested for INSERT_SUBVECTOR and
25552 // CONCAT_VECTORS -- but delegate to common code for result type
25553 // legalisation
25554 return;
25556 EVT VT = N->getValueType(0);
25557 assert((VT == MVT::i8 || VT == MVT::i16) &&
25558 "custom lowering for unexpected type");
25559
25560 Intrinsic::ID IntID =
25561 static_cast<Intrinsic::ID>(N->getConstantOperandVal(0));
25562 switch (IntID) {
25563 default:
25564 return;
25565 case Intrinsic::aarch64_sve_clasta_n: {
25566 SDLoc DL(N);
25567 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
25568 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
25569 N->getOperand(1), Op2, N->getOperand(3));
25570 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25571 return;
25572 }
25573 case Intrinsic::aarch64_sve_clastb_n: {
25574 SDLoc DL(N);
25575 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
25576 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
25577 N->getOperand(1), Op2, N->getOperand(3));
25578 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25579 return;
25580 }
25581 case Intrinsic::aarch64_sve_lasta: {
25582 SDLoc DL(N);
25583 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
25584 N->getOperand(1), N->getOperand(2));
25585 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25586 return;
25587 }
25588 case Intrinsic::aarch64_sve_lastb: {
25589 SDLoc DL(N);
25590 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
25591 N->getOperand(1), N->getOperand(2));
25592 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25593 return;
25594 }
25595 }
25596 }
25597 case ISD::READ_REGISTER: {
25598 SDLoc DL(N);
25599 assert(N->getValueType(0) == MVT::i128 &&
25600 "READ_REGISTER custom lowering is only for 128-bit sysregs");
25601 SDValue Chain = N->getOperand(0);
25602 SDValue SysRegName = N->getOperand(1);
25603
25604 SDValue Result = DAG.getNode(
25605 AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
25606 Chain, SysRegName);
25607
25608 // Sysregs are not endian. Result.getValue(0) always contains the lower half
25609 // of the 128-bit System Register value.
25610 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
25611 Result.getValue(0), Result.getValue(1));
25612 Results.push_back(Pair);
25613 Results.push_back(Result.getValue(2)); // Chain
25614 return;
25615 }
25616 }
25617}
25618
25620 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
25622 return true;
25623}
25624
25625unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
25626 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
25627 // reciprocal if there are three or more FDIVs.
25628 return 3;
25629}
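// Worked standalone example (not LLVM code) of the transform gated above:
// with three or more divisions by the same value, one reciprocal plus
// multiplies replaces the divisions (valid only under reassociation /
// fast-math style assumptions about precision).
#include <cstdio>

int main() {
  double a = 1.0, b = 2.0, c = 3.0, d = 4.0;
  double r = 1.0 / d;                             // single FDIV
  std::printf("%g %g %g\n", a * r, b * r, c * r); // three FMULs instead of FDIVs
}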
25630
25633 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
25634 // v4i16, v2i32 instead of promoting them.
25635 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
25636 VT == MVT::v1f32)
25637 return TypeWidenVector;
25638
25640}
25641
25642// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
25643// provided the address is 16-byte aligned.
25645 if (!Subtarget->hasLSE2())
25646 return false;
25647
25648 if (auto LI = dyn_cast<LoadInst>(I))
25649 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
25650 LI->getAlign() >= Align(16);
25651
25652 if (auto SI = dyn_cast<StoreInst>(I))
25653 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25654 SI->getAlign() >= Align(16);
25655
25656 return false;
25657}
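// Hypothetical source-level example (not part of this file) of an access that
// satisfies the checks above: a 128-bit, 16-byte-aligned atomic load. Whether
// it actually lowers to LDP/STP still depends on the subtarget having LSE2.
// The *Sketch names are illustrative only.
#include <atomic>

struct alignas(16) PairSketch { long long A, B; };
static std::atomic<PairSketch> GlobalPairSketch;

PairSketch loadPairSketch() {
  return GlobalPairSketch.load(std::memory_order_relaxed); // 128 bits, align 16
}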
25658
25660 if (!Subtarget->hasLSE128())
25661 return false;
25662
25663 // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
25664 // will clobber the two registers.
25665 if (const auto *SI = dyn_cast<StoreInst>(I))
25666 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25667 SI->getAlign() >= Align(16) &&
25668 (SI->getOrdering() == AtomicOrdering::Release ||
25669 SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);
25670
25671 if (const auto *RMW = dyn_cast<AtomicRMWInst>(I))
25672 return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25673 RMW->getAlign() >= Align(16) &&
25674 (RMW->getOperation() == AtomicRMWInst::Xchg ||
25675 RMW->getOperation() == AtomicRMWInst::And ||
25676 RMW->getOperation() == AtomicRMWInst::Or);
25677
25678 return false;
25679}
25680
25682 if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
25683 return false;
25684
25685 if (auto LI = dyn_cast<LoadInst>(I))
25686 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
25687 LI->getAlign() >= Align(16) &&
25688 LI->getOrdering() == AtomicOrdering::Acquire;
25689
25690 if (auto SI = dyn_cast<StoreInst>(I))
25691 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25692 SI->getAlign() >= Align(16) &&
25693 SI->getOrdering() == AtomicOrdering::Release;
25694
25695 return false;
25696}
25697
25699 const Instruction *I) const {
25701 return false;
25703 return false;
25705 return true;
25706 return false;
25707}
25708
25710 const Instruction *I) const {
25711 // Store-Release instructions only provide seq_cst guarantees when paired with
25712 // Load-Acquire instructions. MSVC CRT does not use these instructions to
25713 // implement seq_cst loads and stores, so we need additional explicit fences
25714 // after memory writes.
25715 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
25716 return false;
25717
25718 switch (I->getOpcode()) {
25719 default:
25720 return false;
25721 case Instruction::AtomicCmpXchg:
25722 return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() ==
25724 case Instruction::AtomicRMW:
25725 return cast<AtomicRMWInst>(I)->getOrdering() ==
25727 case Instruction::Store:
25728 return cast<StoreInst>(I)->getOrdering() ==
25730 }
25731}
25732
25733// Loads and stores less than 128-bits are already atomic; ones above that
25734// are doomed anyway, so defer to the default libcall and blame the OS when
25735// things go wrong.
25738 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
25739 if (Size != 128)
25741 if (isOpSuitableForRCPC3(SI))
25743 if (isOpSuitableForLSE128(SI))
25745 if (isOpSuitableForLDPSTP(SI))
25748}
25749
25750// Loads and stores less than 128-bits are already atomic; ones above that
25751// are doomed anyway, so defer to the default libcall and blame the OS when
25752// things go wrong.
25755 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
25756
25757 if (Size != 128)
25759 if (isOpSuitableForRCPC3(LI))
25761 // No LSE128 loads
25762 if (isOpSuitableForLDPSTP(LI))
25764
25765 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25766 // implement atomicrmw without spilling. If the target address is also on the
25767 // stack and close enough to the spill slot, this can lead to a situation
25768 // where the monitor always gets cleared and the atomic operation can never
25769 // succeed. So at -O0 lower this operation to a CAS loop.
25770 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
25772
25773 // Using CAS for an atomic load has a better chance of succeeding under high
25774 // contention. So use it if available.
25775 return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
25777}
25778
25779// The "default" for integer RMW operations is to expand to an LL/SC loop.
25780// However, with the LSE instructions (or outline-atomics mode, which provides
25781// library routines in place of the LSE-instructions), we can directly emit many
25782// operations instead.
25783//
25784// Floating-point operations are always emitted to a cmpxchg loop, because they
25785// may trigger a trap which aborts an LLSC sequence.
25788 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
25789 assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
25790
25791 if (AI->isFloatingPointOperation())
25793
25794 bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
25798 if (CanUseLSE128)
25800
25801 // Nand is not supported in LSE.
25802 // Leave 128 bits to LLSC or CmpXChg.
25803 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
25804 if (Subtarget->hasLSE())
25806 if (Subtarget->outlineAtomics()) {
25807 // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
25808 // Don't outline them unless
25809 // (1) high level <atomic> support approved:
25810 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
25811 // (2) low level libgcc and compiler-rt support implemented by:
25812 // min/max outline atomics helpers
25813 if (AI->getOperation() != AtomicRMWInst::Min &&
25818 }
25819 }
25820 }
25821
25822 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25823 // implement atomicrmw without spilling. If the target address is also on the
25824 // stack and close enough to the spill slot, this can lead to a situation
25825 // where the monitor always gets cleared and the atomic operation can never
25826 // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
25827 // we have a single CAS instruction that can replace the loop.
25829 Subtarget->hasLSE())
25831
25833}
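// Hypothetical source-level contrast (not part of this file) for the policy
// above: an integer fetch_add can map directly to an LSE instruction, while a
// floating-point fetch_add is expanded to a compare-exchange loop. The
// floating-point overload requires C++20; the *Sketch names are illustrative.
#include <atomic>

static std::atomic<unsigned> CounterSketch{0};
static std::atomic<float> SumSketch{0.0f};

unsigned bumpIntSketch() { return CounterSketch.fetch_add(1); } // e.g. LDADD with LSE
float bumpFloatSketch() { return SumSketch.fetch_add(1.0f); }   // cmpxchg loop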
25834
25837 AtomicCmpXchgInst *AI) const {
25838 // If subtarget has LSE, leave cmpxchg intact for codegen.
25839 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
25841 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25842 // implement cmpxchg without spilling. If the address being exchanged is also
25843 // on the stack and close enough to the spill slot, this can lead to a
25844 // situation where the monitor always gets cleared and the atomic operation
25845 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
25846 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
25848
25849 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
25850 // it.
25852 if (Size > 64)
25854
25856}
25857
25859 Type *ValueTy, Value *Addr,
25860 AtomicOrdering Ord) const {
25861 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25862 bool IsAcquire = isAcquireOrStronger(Ord);
25863
25864 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldxp and
25865 // ldaxp intrinsics must return {i64, i64} and we have to recombine them into a
25866 // single i128 here.
25867 if (ValueTy->getPrimitiveSizeInBits() == 128) {
25869 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
25871
25872 Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
25873
25874 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
25875 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
25876 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
25877 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
25878 return Builder.CreateOr(
25879 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64");
25880 }
25881
25882 Type *Tys[] = { Addr->getType() };
25884 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
25885 Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
25886
25887 const DataLayout &DL = M->getDataLayout();
25888 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
25889 CallInst *CI = Builder.CreateCall(Ldxr, Addr);
25890 CI->addParamAttr(
25891 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy));
25892 Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
25893
25894 return Builder.CreateBitCast(Trunc, ValueTy);
25895}
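// Standalone sketch (not LLVM code) of the {i64, i64} -> i128 recombination
// emitted above: val128 = zext(lo) | (zext(hi) << 64). Assumes the compiler
// provides unsigned __int128 (a GCC/Clang extension); the combineHalvesSketch
// name is illustrative only.
#include <cstdint>
#include <cstdio>

static unsigned __int128 combineHalvesSketch(uint64_t Lo, uint64_t Hi) {
  return (unsigned __int128)Lo | ((unsigned __int128)Hi << 64);
}

int main() {
  unsigned __int128 V =
      combineHalvesSketch(0x1111222233334444ULL, 0x5555666677778888ULL);
  std::printf("hi=%llx lo=%llx\n", (unsigned long long)(V >> 64),
              (unsigned long long)V); // hi=5555666677778888 lo=1111222233334444
}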
25896
25898 IRBuilderBase &Builder) const {
25899 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25900 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
25901}
25902
25904 Value *Val, Value *Addr,
25905 AtomicOrdering Ord) const {
25906 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25907 bool IsRelease = isReleaseOrStronger(Ord);
25908
25909 // Since the intrinsics must have legal type, the i128 intrinsics take two
25910 // parameters: "i64, i64". We must marshal Val into the appropriate form
25911 // before the call.
25912 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
25914 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
25916 Type *Int64Ty = Type::getInt64Ty(M->getContext());
25917
25918 Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
25919 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
25920 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
25921 }
25922
25924 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
25925 Type *Tys[] = { Addr->getType() };
25926 Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
25927
25928 const DataLayout &DL = M->getDataLayout();
25929 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
25930 Val = Builder.CreateBitCast(Val, IntValTy);
25931
25932 CallInst *CI = Builder.CreateCall(
25933 Stxr, {Builder.CreateZExtOrBitCast(
25934 Val, Stxr->getFunctionType()->getParamType(0)),
25935 Addr});
25936 CI->addParamAttr(1, Attribute::get(Builder.getContext(),
25937 Attribute::ElementType, Val->getType()));
25938 return CI;
25939}
25940
25942 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
25943 const DataLayout &DL) const {
25944 if (!Ty->isArrayTy()) {
25945 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
25946 return TySize.isScalable() && TySize.getKnownMinValue() > 128;
25947 }
25948
25949 // All non-aggregate members of the type must have the same type.
25950 SmallVector<EVT> ValueVTs;
25951 ComputeValueVTs(*this, DL, Ty, ValueVTs);
25952 return all_equal(ValueVTs);
25953}
25954
25955bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
25956 EVT) const {
25957 return false;
25958}
25959
25960static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
25961 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
25962 Function *ThreadPointerFunc =
25963 Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
25964 return IRB.CreatePointerCast(
25965 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
25966 Offset),
25967 IRB.getPtrTy(0));
25968}
25969
25971 // Android provides a fixed TLS slot for the stack cookie. See the definition
25972 // of TLS_SLOT_STACK_GUARD in
25973 // https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/tls_defines.h
25974 if (Subtarget->isTargetAndroid())
25975 return UseTlsOffset(IRB, 0x28);
25976
25977 // Fuchsia is similar.
25978 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
25979 if (Subtarget->isTargetFuchsia())
25980 return UseTlsOffset(IRB, -0x10);
25981
25983}
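// Platform-specific sketch (assumption: AArch64 with GCC/Clang inline asm, on
// Linux/Android) of where the fixed TLS slots referenced above live: they are
// byte offsets from the thread pointer held in TPIDR_EL0, e.g. the Android
// stack cookie sits at thread_pointer + 0x28. The readThreadPointerSketch
// name is illustrative only.
#include <cstdint>

static inline uintptr_t readThreadPointerSketch() {
#if defined(__aarch64__)
  uintptr_t TP;
  __asm__("mrs %0, TPIDR_EL0" : "=r"(TP));
  return TP;
#else
  return 0; // not applicable on other architectures
#endif
}

// void **StackGuardSlot = (void **)(readThreadPointerSketch() + 0x28);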
25984
25986 // MSVC CRT provides functionalities for stack protection.
25987 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
25988 // MSVC CRT has a global variable holding security cookie.
25989 M.getOrInsertGlobal("__security_cookie",
25990 PointerType::getUnqual(M.getContext()));
25991
25992 // MSVC CRT has a function to validate security cookie.
25993 FunctionCallee SecurityCheckCookie =
25994 M.getOrInsertFunction(Subtarget->getSecurityCheckCookieName(),
25995 Type::getVoidTy(M.getContext()),
25996 PointerType::getUnqual(M.getContext()));
25997 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
25998 F->setCallingConv(CallingConv::Win64);
25999 F->addParamAttr(0, Attribute::AttrKind::InReg);
26000 }
26001 return;
26002 }
26004}
26005
26007 // MSVC CRT has a global variable holding security cookie.
26008 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
26009 return M.getGlobalVariable("__security_cookie");
26011}
26012
26014 // MSVC CRT has a function to validate security cookie.
26015 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
26016 return M.getFunction(Subtarget->getSecurityCheckCookieName());
26018}
26019
26020Value *
26022 // Android provides a fixed TLS slot for the SafeStack pointer. See the
26023 // definition of TLS_SLOT_SAFESTACK in
26024 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
26025 if (Subtarget->isTargetAndroid())
26026 return UseTlsOffset(IRB, 0x48);
26027
26028 // Fuchsia is similar.
26029 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
26030 if (Subtarget->isTargetFuchsia())
26031 return UseTlsOffset(IRB, -0x8);
26032
26034}
26035
26037 const Instruction &AndI) const {
26038 // Only sink 'and' mask to cmp use block if it is masking a single bit, since
26039 // this likely allows the and/cmp/br to fold into a single tbz instruction. It
26040 // may be beneficial to sink in other cases, but we would have to check that
26041 // the cmp would not get folded into the br to form a cbz for these to be
26042 // beneficial.
26043 ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
26044 if (!Mask)
26045 return false;
26046 return Mask->getValue().isPowerOf2();
26047}
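// Standalone sketch (not LLVM code) of the single-bit mask test above: only
// masks with exactly one bit set can become a TBZ/TBNZ bit test. The
// isSingleBitMaskSketch name is illustrative only.
#include <cstdint>

static bool isSingleBitMaskSketch(uint64_t Mask) {
  return Mask != 0 && (Mask & (Mask - 1)) == 0;
}

// isSingleBitMaskSketch(0x40) -> true  (test of bit 6)
// isSingleBitMaskSketch(0x03) -> false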
26048
26052 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
26053 SelectionDAG &DAG) const {
26054 // Does baseline recommend not to perform the fold by default?
26056 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
26057 return false;
26058 // Else, if this is a vector shift, prefer 'shl'.
26059 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
26060}
26061
26064 SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
26066 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
26069 ExpansionFactor);
26070}
26071
26073 // Update IsSplitCSR in AArch64FunctionInfo.
26074 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
26075 AFI->setIsSplitCSR(true);
26076}
26077
26079 MachineBasicBlock *Entry,
26080 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
26081 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
26082 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
26083 if (!IStart)
26084 return;
26085
26086 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
26087 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
26088 MachineBasicBlock::iterator MBBI = Entry->begin();
26089 for (const MCPhysReg *I = IStart; *I; ++I) {
26090 const TargetRegisterClass *RC = nullptr;
26091 if (AArch64::GPR64RegClass.contains(*I))
26092 RC = &AArch64::GPR64RegClass;
26093 else if (AArch64::FPR64RegClass.contains(*I))
26094 RC = &AArch64::FPR64RegClass;
26095 else
26096 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
26097
26098 Register NewVR = MRI->createVirtualRegister(RC);
26099 // Create copy from CSR to a virtual register.
26100 // FIXME: this currently does not emit CFI pseudo-instructions; it works
26101 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
26102 // nounwind. If we want to generalize this later, we may need to emit
26103 // CFI pseudo-instructions.
26104 assert(Entry->getParent()->getFunction().hasFnAttribute(
26105 Attribute::NoUnwind) &&
26106 "Function should be nounwind in insertCopiesSplitCSR!");
26107 Entry->addLiveIn(*I);
26108 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
26109 .addReg(*I);
26110
26111 // Insert the copy-back instructions right before the terminator.
26112 for (auto *Exit : Exits)
26113 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
26114 TII->get(TargetOpcode::COPY), *I)
26115 .addReg(NewVR);
26116 }
26117}
26118
26120 // Integer division on AArch64 is expensive. However, when aggressively
26121 // optimizing for code size, we prefer to use a div instruction, as it is
26122 // usually smaller than the alternative sequence.
26123 // The exception to this is vector division. Since AArch64 doesn't have vector
26124 // integer division, leaving the division as-is is a loss even in terms of
26125 // size, because it will have to be scalarized, while the alternative code
26126 // sequence can be performed in vector form.
26127 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
26128 return OptSize && !VT.isVector();
26129}
26130
26132 // We want inc-of-add for scalars and sub-of-not for vectors.
26133 return VT.isScalarInteger();
26134}
26135
26137 EVT VT) const {
26138 // v8f16 without fp16 needs to be extended to v8f32, which is more difficult to
26139 // legalize.
26140 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
26141 return false;
26142 if (FPVT == MVT::v8bf16)
26143 return false;
26144 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
26145}
26146
26150 const TargetInstrInfo *TII) const {
26151 assert(MBBI->isCall() && MBBI->getCFIType() &&
26152 "Invalid call instruction for a KCFI check");
26153
26154 switch (MBBI->getOpcode()) {
26155 case AArch64::BLR:
26156 case AArch64::BLRNoIP:
26157 case AArch64::TCRETURNri:
26158 case AArch64::TCRETURNrix16x17:
26159 case AArch64::TCRETURNrix17:
26160 case AArch64::TCRETURNrinotx16:
26161 break;
26162 default:
26163 llvm_unreachable("Unexpected CFI call opcode");
26164 }
26165
26166 MachineOperand &Target = MBBI->getOperand(0);
26167 assert(Target.isReg() && "Invalid target operand for an indirect call");
26168 Target.setIsRenamable(false);
26169
26170 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK))
26171 .addReg(Target.getReg())
26172 .addImm(MBBI->getCFIType())
26173 .getInstr();
26174}
26175
26177 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
26178}
26179
26180unsigned
26182 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
26183 return getPointerTy(DL).getSizeInBits();
26184
26185 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
26186}
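
// For reference, the "3 pointers + 2 ints" figure matches the AAPCS64 va_list
// layout (a sketch; field names as in the ABI documentation):
//   struct __va_list {
//     void *__stack;   // next stacked argument
//     void *__gr_top;  // end of the general-purpose register save area
//     void *__vr_top;  // end of the FP/SIMD register save area
//     int   __gr_offs; // offset from __gr_top to the next GP register arg
//     int   __vr_offs; // offset from __vr_top to the next FP/SIMD register arg
//   };
// Darwin and Windows use a plain pointer for va_list, hence the pointer size.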
26187
26188void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
26189 MachineFrameInfo &MFI = MF.getFrameInfo();
26190 // If we have any vulnerable SVE stack objects then the stack protector
26191 // needs to be placed at the top of the SVE stack area, as the SVE locals
26192 // are placed above the other locals, so we allocate it as if it were a
26193 // scalable vector.
26194 // FIXME: It may be worthwhile having a specific interface for this rather
26195 // than doing it here in finalizeLowering.
26196 if (MFI.hasStackProtectorIndex()) {
26197 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
26203 break;
26204 }
26205 }
26206 }
26209}
26210
26211// Unlike X86, we let frame lowering assign offsets to all catch objects.
26213 return false;
26214}
26215
26216bool AArch64TargetLowering::shouldLocalize(
26217 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
26218 auto &MF = *MI.getMF();
26219 auto &MRI = MF.getRegInfo();
26220 auto maxUses = [](unsigned RematCost) {
26221 // A cost of 1 means remats are basically free.
26222 if (RematCost == 1)
26223 return std::numeric_limits<unsigned>::max();
26224 if (RematCost == 2)
26225 return 2U;
26226
26227 // Remat is too expensive, only sink if there's one user.
26228 if (RematCost > 2)
26229 return 1U;
26230 llvm_unreachable("Unexpected remat cost");
26231 };
26232
26233 unsigned Opc = MI.getOpcode();
26234 switch (Opc) {
26235 case TargetOpcode::G_GLOBAL_VALUE: {
26236 // On Darwin, TLS global vars get selected into function calls, which
26237 // we don't want localized, as they can get moved into the middle of
26238 // another call sequence.
26239 const GlobalValue &GV = *MI.getOperand(1).getGlobal();
26240 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
26241 return false;
26242 return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
26243 }
26244 case TargetOpcode::G_FCONSTANT:
26245 case TargetOpcode::G_CONSTANT: {
26246 const ConstantInt *CI;
26247 unsigned AdditionalCost = 0;
26248
26249 if (Opc == TargetOpcode::G_CONSTANT)
26250 CI = MI.getOperand(1).getCImm();
26251 else {
26252 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
26253 // We try to estimate cost of 32/64b fpimms, as they'll likely be
26254 // materialized as integers.
26255 if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
26256 break;
26257 auto APF = MI.getOperand(1).getFPImm()->getValueAPF();
26258 bool OptForSize =
26261 OptForSize))
26262 return true; // Constant should be cheap.
26263 CI =
26264 ConstantInt::get(MF.getFunction().getContext(), APF.bitcastToAPInt());
26265 // FP materialization also costs an extra move, from gpr to fpr.
26266 AdditionalCost = 1;
26267 }
26268 APInt Imm = CI->getValue();
26271 assert(Cost.isValid() && "Expected a valid imm cost");
26272
26273 unsigned RematCost = *Cost.getValue();
26274 RematCost += AdditionalCost;
26275 Register Reg = MI.getOperand(0).getReg();
26276 unsigned MaxUses = maxUses(RematCost);
26277 // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
26278 if (MaxUses == std::numeric_limits<unsigned>::max())
26279 --MaxUses;
26280 return MRI.hasAtMostUserInstrs(Reg, MaxUses);
26281 }
26282 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
26283 // localizable.
26284 case AArch64::ADRP:
26285 case AArch64::G_ADD_LOW:
26286 // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
26287 case TargetOpcode::G_PTR_ADD:
26288 return true;
26289 default:
26290 break;
26291 }
26293}
26294
26296 if (Inst.getType()->isScalableTy())
26297 return true;
26298
26299 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
26300 if (Inst.getOperand(i)->getType()->isScalableTy())
26301 return true;
26302
26303 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
26304 if (AI->getAllocatedType()->isScalableTy())
26305 return true;
26306 }
26307
26308 // Checks to allow the use of SME instructions
26309 if (auto *Base = dyn_cast<CallBase>(&Inst)) {
26310 auto CallerAttrs = SMEAttrs(*Inst.getFunction());
26311 auto CalleeAttrs = SMEAttrs(*Base);
26312 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
26313 CallerAttrs.requiresLazySave(CalleeAttrs) ||
26314 CallerAttrs.requiresPreservingZT0(CalleeAttrs))
26315 return true;
26316 }
26317 return false;
26318}
26319
26320// Return the largest legal scalable vector type that matches VT's element type.
26324 "Expected legal fixed length vector!");
26325 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
26326 default:
26327 llvm_unreachable("unexpected element type for SVE container");
26328 case MVT::i8:
26329 return EVT(MVT::nxv16i8);
26330 case MVT::i16:
26331 return EVT(MVT::nxv8i16);
26332 case MVT::i32:
26333 return EVT(MVT::nxv4i32);
26334 case MVT::i64:
26335 return EVT(MVT::nxv2i64);
26336 case MVT::bf16:
26337 return EVT(MVT::nxv8bf16);
26338 case MVT::f16:
26339 return EVT(MVT::nxv8f16);
26340 case MVT::f32:
26341 return EVT(MVT::nxv4f32);
26342 case MVT::f64:
26343 return EVT(MVT::nxv2f64);
26344 }
26345}
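
// For example (a sketch): only the element type matters here, so
//   v4i32, v8i32, v16i32 -> nxv4i32
//   v16f16               -> nxv8f16
//   v32i8                -> nxv16i8
// The number of lanes that actually hold data is described separately by the
// predicate from getPredicateForFixedLengthVector below.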
26346
26347// Return a PTRUE with active lanes corresponding to the extent of VT.
26349 EVT VT) {
26352 "Expected legal fixed length vector!");
26353
26354 std::optional<unsigned> PgPattern =
26356 assert(PgPattern && "Unexpected element count for SVE predicate");
26357
26358 // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
26359 // AArch64SVEPredPattern::all, which can enable the use of unpredicated
26360 // variants of instructions when available.
26361 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
26362 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
26363 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
26364 if (MaxSVESize && MinSVESize == MaxSVESize &&
26365 MaxSVESize == VT.getSizeInBits())
26366 PgPattern = AArch64SVEPredPattern::all;
26367
26368 MVT MaskVT;
26369 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
26370 default:
26371 llvm_unreachable("unexpected element type for SVE predicate");
26372 case MVT::i8:
26373 MaskVT = MVT::nxv16i1;
26374 break;
26375 case MVT::i16:
26376 case MVT::f16:
26377 case MVT::bf16:
26378 MaskVT = MVT::nxv8i1;
26379 break;
26380 case MVT::i32:
26381 case MVT::f32:
26382 MaskVT = MVT::nxv4i1;
26383 break;
26384 case MVT::i64:
26385 case MVT::f64:
26386 MaskVT = MVT::nxv2i1;
26387 break;
26388 }
26389
26390 return getPTrue(DAG, DL, MaskVT, *PgPattern);
26391}
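
// Sketch of the predicates this produces:
//   v8i32  -> PTRUE p.s, vl8
//   v16i16 -> PTRUE p.h, vl16
// and, when the minimum and maximum SVE vector lengths are known to be equal
// and the fixed-length type fills the whole register, PTRUE p.<T>, all, which
// lets later code use unpredicated instruction forms.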
26392
26394 EVT VT) {
26396 "Expected legal scalable vector!");
26397 auto PredTy = VT.changeVectorElementType(MVT::i1);
26398 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
26399}
26400
26402 if (VT.isFixedLengthVector())
26403 return getPredicateForFixedLengthVector(DAG, DL, VT);
26404
26405 return getPredicateForScalableVector(DAG, DL, VT);
26406}
26407
26408// Grow V to consume an entire SVE register.
26410 assert(VT.isScalableVector() &&
26411 "Expected to convert into a scalable vector!");
26412 assert(V.getValueType().isFixedLengthVector() &&
26413 "Expected a fixed length vector operand!");
26414 SDLoc DL(V);
26415 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
26416 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
26417}
26418
26419// Shrink V so it's just big enough to maintain a VT's worth of data.
26422 "Expected to convert into a fixed length vector!");
26423 assert(V.getValueType().isScalableVector() &&
26424 "Expected a scalable vector operand!");
26425 SDLoc DL(V);
26426 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
26427 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
26428}
26429
26430// Convert all fixed length vector loads larger than NEON to masked_loads.
26431SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
26432 SDValue Op, SelectionDAG &DAG) const {
26433 auto Load = cast<LoadSDNode>(Op);
26434
26435 SDLoc DL(Op);
26436 EVT VT = Op.getValueType();
26437 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26438 EVT LoadVT = ContainerVT;
26439 EVT MemVT = Load->getMemoryVT();
26440
26441 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
26442
26443 if (VT.isFloatingPoint()) {
26444 LoadVT = ContainerVT.changeTypeToInteger();
26445 MemVT = MemVT.changeTypeToInteger();
26446 }
26447
26448 SDValue NewLoad = DAG.getMaskedLoad(
26449 LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
26450 DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
26451 Load->getAddressingMode(), Load->getExtensionType());
26452
26453 SDValue Result = NewLoad;
26454 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
26455 EVT ExtendVT = ContainerVT.changeVectorElementType(
26456 Load->getMemoryVT().getVectorElementType());
26457
26458 Result = getSVESafeBitCast(ExtendVT, Result, DAG);
26460 Pg, Result, DAG.getUNDEF(ContainerVT));
26461 } else if (VT.isFloatingPoint()) {
26462 Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
26463 }
26464
26465 Result = convertFromScalableVector(DAG, VT, Result);
26466 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
26467 return DAG.getMergeValues(MergedValues, DL);
26468}
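
// Rough shape of the transform (illustrative, assuming a 256-bit SVE target):
//   t0: v8f32,ch = load ...
// becomes
//   pg = PTRUE.S vl8
//   t1: nxv4i32,ch = masked_load ..., pg, undef  ; FP loads use the integer
//   t2: nxv4f32    = bitcast t1                  ; container, then bitcast
//   t3: v8f32      = extract_subvector t2, 0
// Extending FP loads instead go through FP_EXTEND_MERGE_PASSTHRU after an
// SVE-safe bitcast, as handled above.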
26469
26471 SelectionDAG &DAG) {
26472 SDLoc DL(Mask);
26473 EVT InVT = Mask.getValueType();
26474 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26475
26476 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
26477
26478 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
26479 return Pg;
26480
26481 auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
26482 auto Op2 = DAG.getConstant(0, DL, ContainerVT);
26483
26485 {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});
26486}
26487
26488// Convert all fixed length vector masked loads larger than NEON to SVE masked_loads.
26489SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
26490 SDValue Op, SelectionDAG &DAG) const {
26491 auto Load = cast<MaskedLoadSDNode>(Op);
26492
26493 SDLoc DL(Op);
26494 EVT VT = Op.getValueType();
26495 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26496
26497 SDValue Mask = Load->getMask();
26498 // If this is an extending load and the mask type is not the same as
26499 // the load's type, then we have to extend the mask type.
26500 if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
26501 assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
26502 "Incorrect mask type");
26503 Mask = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Mask);
26504 }
26506
26507 SDValue PassThru;
26508 bool IsPassThruZeroOrUndef = false;
26509
26510 if (Load->getPassThru()->isUndef()) {
26511 PassThru = DAG.getUNDEF(ContainerVT);
26512 IsPassThruZeroOrUndef = true;
26513 } else {
26514 if (ContainerVT.isInteger())
26515 PassThru = DAG.getConstant(0, DL, ContainerVT);
26516 else
26517 PassThru = DAG.getConstantFP(0, DL, ContainerVT);
26518 if (isZerosVector(Load->getPassThru().getNode()))
26519 IsPassThruZeroOrUndef = true;
26520 }
26521
26522 SDValue NewLoad = DAG.getMaskedLoad(
26523 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
26524 Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
26525 Load->getAddressingMode(), Load->getExtensionType());
26526
26527 SDValue Result = NewLoad;
26528 if (!IsPassThruZeroOrUndef) {
26529 SDValue OldPassThru =
26530 convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
26531 Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
26532 }
26533
26534 Result = convertFromScalableVector(DAG, VT, Result);
26535 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
26536 return DAG.getMergeValues(MergedValues, DL);
26537}
26538
26539// Convert all fixed length vector stores larger than NEON to masked_stores.
26540SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
26541 SDValue Op, SelectionDAG &DAG) const {
26542 auto Store = cast<StoreSDNode>(Op);
26543
26544 SDLoc DL(Op);
26545 EVT VT = Store->getValue().getValueType();
26546 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26547 EVT MemVT = Store->getMemoryVT();
26548
26549 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
26550 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
26551
26552 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
26553 EVT TruncVT = ContainerVT.changeVectorElementType(
26554 Store->getMemoryVT().getVectorElementType());
26555 MemVT = MemVT.changeTypeToInteger();
26556 NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
26557 NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
26558 DAG.getUNDEF(TruncVT));
26559 NewValue =
26560 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
26561 } else if (VT.isFloatingPoint()) {
26562 MemVT = MemVT.changeTypeToInteger();
26563 NewValue =
26564 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
26565 }
26566
26567 return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
26568 Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
26569 Store->getMemOperand(), Store->getAddressingMode(),
26570 Store->isTruncatingStore());
26571}
26572
26573SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
26574 SDValue Op, SelectionDAG &DAG) const {
26575 auto *Store = cast<MaskedStoreSDNode>(Op);
26576
26577 SDLoc DL(Op);
26578 EVT VT = Store->getValue().getValueType();
26579 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26580
26581 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
26583
26584 return DAG.getMaskedStore(
26585 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
26586 Mask, Store->getMemoryVT(), Store->getMemOperand(),
26587 Store->getAddressingMode(), Store->isTruncatingStore());
26588}
26589
26590SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
26591 SDValue Op, SelectionDAG &DAG) const {
26592 SDLoc dl(Op);
26593 EVT VT = Op.getValueType();
26594 EVT EltVT = VT.getVectorElementType();
26595
26596 bool Signed = Op.getOpcode() == ISD::SDIV;
26597 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
26598
26599 bool Negated;
26600 uint64_t SplatVal;
26601 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
26602 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26603 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
26604 SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32);
26605
26606 SDValue Pg = getPredicateForFixedLengthVector(DAG, dl, VT);
26607 SDValue Res =
26608 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, ContainerVT, Pg, Op1, Op2);
26609 if (Negated)
26610 Res = DAG.getNode(ISD::SUB, dl, ContainerVT,
26611 DAG.getConstant(0, dl, ContainerVT), Res);
26612
26613 return convertFromScalableVector(DAG, VT, Res);
26614 }
26615
26616 // Scalable vector i32/i64 DIV is supported.
26617 if (EltVT == MVT::i32 || EltVT == MVT::i64)
26618 return LowerToPredicatedOp(Op, DAG, PredOpcode);
26619
26620 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
26621 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
26622 EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
26623 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
26624
26625 // If the wider type is legal: extend, op, and truncate.
26626 EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
26627 if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
26628 SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(0));
26629 SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(1));
26630 SDValue Div = DAG.getNode(Op.getOpcode(), dl, WideVT, Op0, Op1);
26631 return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
26632 }
26633
26634 auto HalveAndExtendVector = [&DAG, &dl, &HalfVT, &PromVT,
26635 &ExtendOpcode](SDValue Op) {
26636 SDValue IdxZero = DAG.getConstant(0, dl, MVT::i64);
26637 SDValue IdxHalf =
26638 DAG.getConstant(HalfVT.getVectorNumElements(), dl, MVT::i64);
26639 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxZero);
26640 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxHalf);
26641 return std::pair<SDValue, SDValue>(
26642 {DAG.getNode(ExtendOpcode, dl, PromVT, Lo),
26643 DAG.getNode(ExtendOpcode, dl, PromVT, Hi)});
26644 };
26645
26646 // If wider type is not legal: split, extend, op, trunc and concat.
26647 auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
26648 auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
26649 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0LoExt, Op1LoExt);
26650 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0HiExt, Op1HiExt);
26651 SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Lo);
26652 SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Hi);
26653 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, {LoTrunc, HiTrunc});
26654}
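
// Sketch of the power-of-two splat case: "sdiv <n x i32> %x, splat(8)" maps to
// an SRAD_MERGE_OP1 node, i.e. roughly
//   asrd z0.s, p0/m, z0.s, #3
// and a divisor of -8 additionally negates the result (subtract from zero).
// Non-power-of-two i8/i16 divisions take the widening paths above, because SVE
// only provides SDIV/UDIV for 32- and 64-bit elements.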
26655
26656SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
26657 SDValue Op, SelectionDAG &DAG) const {
26658 EVT VT = Op.getValueType();
26659 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26660
26661 SDLoc DL(Op);
26662 SDValue Val = Op.getOperand(0);
26663 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
26664 Val = convertToScalableVector(DAG, ContainerVT, Val);
26665
26666 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
26667 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
26668
26669 // Repeatedly unpack Val until the result is of the desired element type.
26670 switch (ContainerVT.getSimpleVT().SimpleTy) {
26671 default:
26672 llvm_unreachable("unimplemented container type");
26673 case MVT::nxv16i8:
26674 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
26675 if (VT.getVectorElementType() == MVT::i16)
26676 break;
26677 [[fallthrough]];
26678 case MVT::nxv8i16:
26679 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
26680 if (VT.getVectorElementType() == MVT::i32)
26681 break;
26682 [[fallthrough]];
26683 case MVT::nxv4i32:
26684 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
26685 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
26686 break;
26687 }
26688
26689 return convertFromScalableVector(DAG, VT, Val);
26690}
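
// Unpack-chain sketch: sign extending i8 source elements all the way to i64
// walks the containers with three SUNPKLO steps (UUNPKLO for zero extension):
//   nxv16i8 --SUNPKLO--> nxv8i16 --SUNPKLO--> nxv4i32 --SUNPKLO--> nxv2i64
// Each step widens the low half of the lanes, which is where the fixed-length
// data lives, so no other shuffling is needed.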
26691
26692SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
26693 SDValue Op, SelectionDAG &DAG) const {
26694 EVT VT = Op.getValueType();
26695 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26696
26697 SDLoc DL(Op);
26698 SDValue Val = Op.getOperand(0);
26699 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
26700 Val = convertToScalableVector(DAG, ContainerVT, Val);
26701
26702 // Repeatedly truncate Val until the result is of the desired element type.
26703 switch (ContainerVT.getSimpleVT().SimpleTy) {
26704 default:
26705 llvm_unreachable("unimplemented container type");
26706 case MVT::nxv2i64:
26707 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
26708 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
26709 if (VT.getVectorElementType() == MVT::i32)
26710 break;
26711 [[fallthrough]];
26712 case MVT::nxv4i32:
26713 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
26714 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
26715 if (VT.getVectorElementType() == MVT::i16)
26716 break;
26717 [[fallthrough]];
26718 case MVT::nxv8i16:
26719 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
26720 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
26721 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
26722 break;
26723 }
26724
26725 return convertFromScalableVector(DAG, VT, Val);
26726}
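
// Truncation sketch, going from i64 elements down to i8:
//   nxv2i64 --bitcast--> nxv4i32 --UZP1(Val,Val)--> low i32 halves packed
//           --bitcast--> nxv8i16 --UZP1(Val,Val)--> low i16 halves packed
//           --bitcast--> nxv16i8 --UZP1(Val,Val)--> low i8 halves packed
// UZP1 keeps the even-numbered sub-elements, i.e. the low half of each wider
// element, and packs them into the low lanes, which is where the fixed-length
// result is read from.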
26727
26728SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
26729 SDValue Op, SelectionDAG &DAG) const {
26730 EVT VT = Op.getValueType();
26731 EVT InVT = Op.getOperand(0).getValueType();
26732 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
26733
26734 SDLoc DL(Op);
26735 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26736 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
26737
26738 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
26739}
26740
26741SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
26742 SDValue Op, SelectionDAG &DAG) const {
26743 EVT VT = Op.getValueType();
26744 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26745
26746 SDLoc DL(Op);
26747 EVT InVT = Op.getOperand(0).getValueType();
26748 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26749 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
26750
26751 auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
26752 Op.getOperand(1), Op.getOperand(2));
26753
26754 return convertFromScalableVector(DAG, VT, ScalableRes);
26755}
26756
26757// Convert vector operation 'Op' to an equivalent predicated operation whereby
26758// the original operation's type is used to construct a suitable predicate.
26759// NOTE: The results for inactive lanes are undefined.
26760SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
26761 SelectionDAG &DAG,
26762 unsigned NewOp) const {
26763 EVT VT = Op.getValueType();
26764 SDLoc DL(Op);
26765 auto Pg = getPredicateForVector(DAG, DL, VT);
26766
26767 if (VT.isFixedLengthVector()) {
26768 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
26769 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26770
26771 // Create list of operands by converting existing ones to scalable types.
26773 for (const SDValue &V : Op->op_values()) {
26774 if (isa<CondCodeSDNode>(V)) {
26775 Operands.push_back(V);
26776 continue;
26777 }
26778
26779 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
26780 EVT VTArg = VTNode->getVT().getVectorElementType();
26781 EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
26782 Operands.push_back(DAG.getValueType(NewVTArg));
26783 continue;
26784 }
26785
26786 assert(isTypeLegal(V.getValueType()) &&
26787 "Expected only legal fixed-width types");
26788 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
26789 }
26790
26791 if (isMergePassthruOpcode(NewOp))
26792 Operands.push_back(DAG.getUNDEF(ContainerVT));
26793
26794 auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
26795 return convertFromScalableVector(DAG, VT, ScalableRes);
26796 }
26797
26798 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
26799
26801 for (const SDValue &V : Op->op_values()) {
26802 assert((!V.getValueType().isVector() ||
26803 V.getValueType().isScalableVector()) &&
26804 "Only scalable vectors are supported!");
26805 Operands.push_back(V);
26806 }
26807
26808 if (isMergePassthruOpcode(NewOp))
26809 Operands.push_back(DAG.getUNDEF(VT));
26810
26811 return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
26812}
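
// Sketch of the fixed-length path: an fadd of two v8f32 values on a 256-bit
// SVE target, with NewOp == AArch64ISD::FADD_PRED, becomes roughly
//   pg = PTRUE.S vl8
//   a' = insert_subvector undef:nxv4f32, a, 0
//   b' = insert_subvector undef:nxv4f32, b, 0
//   r' = FADD_PRED pg, a', b'
//   r  = extract_subvector r', 0
// The inactive lanes of r' are undefined, which is fine because only the first
// eight lanes are ever read back.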
26813
26814// If a fixed length vector operation has no side effects when applied to
26815// undefined elements, we can safely use scalable vectors to perform the same
26816// operation without needing to worry about predication.
26817SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
26818 SelectionDAG &DAG) const {
26819 EVT VT = Op.getValueType();
26821 "Only expected to lower fixed length vector operation!");
26822 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26823
26824 // Create list of operands by converting existing ones to scalable types.
26826 for (const SDValue &V : Op->op_values()) {
26827 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
26828
26829 // Pass through non-vector operands.
26830 if (!V.getValueType().isVector()) {
26831 Ops.push_back(V);
26832 continue;
26833 }
26834
26835 // "cast" fixed length vector to a scalable vector.
26836 assert(V.getValueType().isFixedLengthVector() &&
26837 isTypeLegal(V.getValueType()) &&
26838 "Only fixed length vectors are supported!");
26839 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
26840 }
26841
26842 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
26843 return convertFromScalableVector(DAG, VT, ScalableRes);
26844}
26845
26846SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
26847 SelectionDAG &DAG) const {
26848 SDLoc DL(ScalarOp);
26849 SDValue AccOp = ScalarOp.getOperand(0);
26850 SDValue VecOp = ScalarOp.getOperand(1);
26851 EVT SrcVT = VecOp.getValueType();
26852 EVT ResVT = SrcVT.getVectorElementType();
26853
26854 EVT ContainerVT = SrcVT;
26855 if (SrcVT.isFixedLengthVector()) {
26856 ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
26857 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
26858 }
26859
26860 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
26861 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
26862
26863 // Convert operands to Scalable.
26864 AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
26865 DAG.getUNDEF(ContainerVT), AccOp, Zero);
26866
26867 // Perform reduction.
26868 SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
26869 Pg, AccOp, VecOp);
26870
26871 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
26872}
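
// Illustrative lowering of "vecreduce_seq_fadd(acc, <8 x float> v)" on a
// 256-bit SVE target:
//   pg   = PTRUE.S vl8
//   accv = insert_vector_elt undef:nxv4f32, acc, 0
//   rdx  = FADDA_PRED pg, accv, v'      ; v' is v widened into the container
//   res  = extract_vector_elt rdx, 0
// FADDA accumulates the active lanes strictly in order, preserving the
// sequential FP semantics that VECREDUCE_SEQ_FADD requires.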
26873
26874SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
26875 SelectionDAG &DAG) const {
26876 SDLoc DL(ReduceOp);
26877 SDValue Op = ReduceOp.getOperand(0);
26878 EVT OpVT = Op.getValueType();
26879 EVT VT = ReduceOp.getValueType();
26880
26881 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
26882 return SDValue();
26883
26884 SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
26885
26886 switch (ReduceOp.getOpcode()) {
26887 default:
26888 return SDValue();
26889 case ISD::VECREDUCE_OR:
26890 if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
26891 // The predicate can be 'Op' because
26892 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
26893 return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
26894 else
26895 return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
26896 case ISD::VECREDUCE_AND: {
26897 Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
26898 return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
26899 }
26900 case ISD::VECREDUCE_XOR: {
26901 SDValue ID =
26902 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
26903 if (OpVT == MVT::nxv1i1) {
26904 // Emulate a CNTP on .Q using .D and a different governing predicate.
26905 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
26906 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
26907 }
26908 SDValue Cntp =
26909 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
26910 return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
26911 }
26912 }
26913
26914 return SDValue();
26915}
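
// Sketches of the predicate reductions handled above:
//   vecreduce_or  p -> PTEST p, p (or PTEST pg, p) with ANY_ACTIVE, i.e. "is
//                      any governed lane of p set?"
//   vecreduce_and p -> PTEST pg, (p xor pg) with NONE_ACTIVE, i.e. "is no
//                      governed lane of p clear?"
//   vecreduce_xor p -> CNTP pg, p truncated to i1, i.e. the parity of the
//                      number of active lanes.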
26916
26917SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
26918 SDValue ScalarOp,
26919 SelectionDAG &DAG) const {
26920 SDLoc DL(ScalarOp);
26921 SDValue VecOp = ScalarOp.getOperand(0);
26922 EVT SrcVT = VecOp.getValueType();
26923
26925 SrcVT,
26926 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
26927 EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
26928 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
26929 }
26930
26931 // UADDV always returns an i64 result.
26932 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
26933 SrcVT.getVectorElementType();
26934 EVT RdxVT = SrcVT;
26935 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
26936 RdxVT = getPackedSVEVectorVT(ResVT);
26937
26938 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
26939 SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
26940 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
26941 Rdx, DAG.getConstant(0, DL, MVT::i64));
26942
26943 // The VEC_REDUCE nodes expect an element size result.
26944 if (ResVT != ScalarOp.getValueType())
26945 Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
26946
26947 return Res;
26948}
26949
26950SDValue
26951AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
26952 SelectionDAG &DAG) const {
26953 EVT VT = Op.getValueType();
26954 SDLoc DL(Op);
26955
26956 EVT InVT = Op.getOperand(1).getValueType();
26957 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26958 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
26959 SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
26960
26961 // Convert the mask to a predicate (NOTE: We don't need to worry about
26962 // inactive lanes since VSELECT is safe when given undefined elements).
26963 EVT MaskVT = Op.getOperand(0).getValueType();
26964 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
26965 auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
26967 MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
26968
26969 auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
26970 Mask, Op1, Op2);
26971
26972 return convertFromScalableVector(DAG, VT, ScalableRes);
26973}
26974
26975SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
26976 SDValue Op, SelectionDAG &DAG) const {
26977 SDLoc DL(Op);
26978 EVT InVT = Op.getOperand(0).getValueType();
26979 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26980
26981 assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
26982 "Only expected to lower fixed length vector operation!");
26983 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
26984 "Expected integer result of the same bit length as the inputs!");
26985
26986 auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
26987 auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
26988 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
26989
26990 EVT CmpVT = Pg.getValueType();
26991 auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
26992 {Pg, Op1, Op2, Op.getOperand(2)});
26993
26994 EVT PromoteVT = ContainerVT.changeTypeToInteger();
26995 auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
26996 return convertFromScalableVector(DAG, Op.getValueType(), Promote);
26997}
26998
26999SDValue
27000AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
27001 SelectionDAG &DAG) const {
27002 SDLoc DL(Op);
27003 auto SrcOp = Op.getOperand(0);
27004 EVT VT = Op.getValueType();
27005 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27006 EVT ContainerSrcVT =
27007 getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
27008
27009 SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
27010 Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
27011 return convertFromScalableVector(DAG, VT, Op);
27012}
27013
27014SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
27015 SDValue Op, SelectionDAG &DAG) const {
27016 SDLoc DL(Op);
27017 unsigned NumOperands = Op->getNumOperands();
27018
27019 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
27020 "Unexpected number of operands in CONCAT_VECTORS");
27021
27022 auto SrcOp1 = Op.getOperand(0);
27023 auto SrcOp2 = Op.getOperand(1);
27024 EVT VT = Op.getValueType();
27025 EVT SrcVT = SrcOp1.getValueType();
27026
27027 if (NumOperands > 2) {
27029 EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
27030 for (unsigned I = 0; I < NumOperands; I += 2)
27031 Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
27032 Op->getOperand(I), Op->getOperand(I + 1)));
27033
27034 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
27035 }
27036
27037 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27038
27040 SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
27041 SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
27042
27043 Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
27044
27045 return convertFromScalableVector(DAG, VT, Op);
27046}
27047
27048SDValue
27049AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
27050 SelectionDAG &DAG) const {
27051 EVT VT = Op.getValueType();
27052 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27053
27054 SDLoc DL(Op);
27055 SDValue Val = Op.getOperand(0);
27056 SDValue Pg = getPredicateForVector(DAG, DL, VT);
27057 EVT SrcVT = Val.getValueType();
27058 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27059 EVT ExtendVT = ContainerVT.changeVectorElementType(
27060 SrcVT.getVectorElementType());
27061
27062 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
27063 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
27064
27065 Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
27066 Val = getSVESafeBitCast(ExtendVT, Val, DAG);
27067 Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
27068 Pg, Val, DAG.getUNDEF(ContainerVT));
27069
27070 return convertFromScalableVector(DAG, VT, Val);
27071}
27072
27073SDValue
27074AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
27075 SelectionDAG &DAG) const {
27076 EVT VT = Op.getValueType();
27077 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27078
27079 SDLoc DL(Op);
27080 SDValue Val = Op.getOperand(0);
27081 EVT SrcVT = Val.getValueType();
27082 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27083 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
27085 SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
27086
27087 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27088 Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
27089 Op.getOperand(1), DAG.getUNDEF(RoundVT));
27090 Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
27091 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
27092
27093 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
27094 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
27095}
27096
27097SDValue
27098AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
27099 SelectionDAG &DAG) const {
27100 EVT VT = Op.getValueType();
27101 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27102
27103 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
27104 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
27106
27107 SDLoc DL(Op);
27108 SDValue Val = Op.getOperand(0);
27109 EVT SrcVT = Val.getValueType();
27110 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27111 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27112
27113 if (VT.bitsGE(SrcVT)) {
27115
27116 Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
27117 VT.changeTypeToInteger(), Val);
27118
27119 // Safe to use a larger than specified operand because by promoting the
27120 // value nothing has changed from an arithmetic point of view.
27121 Val =
27122 convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
27123 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
27124 DAG.getUNDEF(ContainerDstVT));
27125 return convertFromScalableVector(DAG, VT, Val);
27126 } else {
27127 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
27128 ContainerDstVT.getVectorElementType());
27130
27131 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27132 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
27133 Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
27134 Val = convertFromScalableVector(DAG, SrcVT, Val);
27135
27136 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
27137 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
27138 }
27139}
27140
27141SDValue
27142AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
27143 SelectionDAG &DAG) const {
27144 SDLoc DL(Op);
27145 EVT OpVT = Op.getValueType();
27146 assert(OpVT.isScalableVector() &&
27147 "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
27148 SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
27149 Op.getOperand(1));
27150 SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
27151 Op.getOperand(1));
27152 return DAG.getMergeValues({Even, Odd}, DL);
27153}
27154
27155SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
27156 SelectionDAG &DAG) const {
27157 SDLoc DL(Op);
27158 EVT OpVT = Op.getValueType();
27159 assert(OpVT.isScalableVector() &&
27160 "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
27161
27162 SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
27163 Op.getOperand(1));
27164 SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0),
27165 Op.getOperand(1));
27166 return DAG.getMergeValues({Lo, Hi}, DL);
27167}
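
// Element-level picture for nxv4i32 with a 128-bit vector (vscale == 1):
//   interleave   A = {a0 a1 a2 a3}, B = {b0 b1 b2 b3}
//     ZIP1 -> {a0 b0 a1 b1}     ZIP2 -> {a2 b2 a3 b3}
//   deinterleave (above)  X = {x0 x1 x2 x3}, Y = {x4 x5 x6 x7}
//     UZP1 -> {x0 x2 x4 x6}     UZP2 -> {x1 x3 x5 x7}
// which is exactly the pair of results each of these ISD nodes is defined to
// produce.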
27168
27169SDValue
27170AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
27171 SelectionDAG &DAG) const {
27172 EVT VT = Op.getValueType();
27173 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27174
27175 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
27176 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
27178
27179 SDLoc DL(Op);
27180 SDValue Val = Op.getOperand(0);
27181 EVT SrcVT = Val.getValueType();
27182 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27183 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27184
27185 if (VT.bitsGT(SrcVT)) {
27186 EVT CvtVT = ContainerDstVT.changeVectorElementType(
27187 ContainerSrcVT.getVectorElementType());
27189
27190 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
27191 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
27192
27193 Val = convertToScalableVector(DAG, ContainerDstVT, Val);
27194 Val = getSVESafeBitCast(CvtVT, Val, DAG);
27195 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
27196 DAG.getUNDEF(ContainerDstVT));
27197 return convertFromScalableVector(DAG, VT, Val);
27198 } else {
27199 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
27201
27202 // Safe to use a larger than specified result since an fp_to_int where the
27203 // result doesn't fit into the destination is undefined.
27204 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27205 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
27206 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
27207
27208 return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
27209 }
27210}
27211
27213 ArrayRef<int> ShuffleMask, EVT VT,
27214 EVT ContainerVT, SelectionDAG &DAG) {
27215 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
27216 SDLoc DL(Op);
27217 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
27218 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
27219 bool IsSingleOp =
27220 ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());
27221
27222 if (!Subtarget.isNeonAvailable() && !MinSVESize)
27223 MinSVESize = 128;
27224
27225 // Give up on two-source shuffles when SVE2 is not available, or when not
27226 // all index values can be represented.
27227 if (!IsSingleOp && !Subtarget.hasSVE2())
27228 return SDValue();
27229
27230 EVT VTOp1 = Op.getOperand(0).getValueType();
27231 unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
27232 unsigned IndexLen = MinSVESize / BitsPerElt;
27233 unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
27234 uint64_t MaxOffset = APInt(BitsPerElt, -1, false).getZExtValue();
27235 EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
27236 EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
27237 bool MinMaxEqual = (MinSVESize == MaxSVESize);
27238 assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
27239 "Incorrectly legalised shuffle operation");
27240
27242 // If MinSVESize is not equal to MaxSVESize then we need to know which
27243 // TBL mask element needs adjustment.
27244 SmallVector<SDValue, 8> AddRuntimeVLMask;
27245
27246 // Bail out for 8-bit element types, because with a 2048-bit SVE register
27247 // size, 8 bits are only sufficient to index into the first source vector.
27248 if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
27249 return SDValue();
27250
27251 for (int Index : ShuffleMask) {
27252 // Handling poison index value.
27253 if (Index < 0)
27254 Index = 0;
27255 // If the mask refers to elements in the second operand, then we have to
27256 // offset the index by the number of elements in a vector. If this number
27257 // is not known at compile-time, we need to maintain a mask with 'VL' values
27258 // to add at runtime.
27259 if ((unsigned)Index >= ElementsPerVectorReg) {
27260 if (MinMaxEqual) {
27261 Index += IndexLen - ElementsPerVectorReg;
27262 } else {
27263 Index = Index - ElementsPerVectorReg;
27264 AddRuntimeVLMask.push_back(DAG.getConstant(1, DL, MVT::i64));
27265 }
27266 } else if (!MinMaxEqual)
27267 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
27268 // For 8-bit elements and 1024-bit SVE registers, where MaxOffset equals
27269 // 255, this might point to the last element of the second operand of the
27270 // shufflevector, so we reject this transform.
27271 if ((unsigned)Index >= MaxOffset)
27272 return SDValue();
27273 TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
27274 }
27275
27276 // Pad the remaining mask lanes with an out-of-range index, which TBL treats
27277 // as a request to zero the lane; a zero index would instead duplicate the
27278 // first lane into those elements. Note that for i8 elements an out-of-range
27279 // index can still be a valid index with a 2048-bit vector register.
27280 for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
27281 TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
27282 if (!MinMaxEqual)
27283 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
27284 }
27285
27286 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
27287 SDValue VecMask =
27288 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
27289 SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
27290
27291 SDValue Shuffle;
27292 if (IsSingleOp)
27293 Shuffle =
27294 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
27295 DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
27296 Op1, SVEMask);
27297 else if (Subtarget.hasSVE2()) {
27298 if (!MinMaxEqual) {
27299 unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
27300 SDValue VScale = (BitsPerElt == 64)
27301 ? DAG.getVScale(DL, MVT::i64, APInt(64, MinNumElts))
27302 : DAG.getVScale(DL, MVT::i32, APInt(32, MinNumElts));
27303 SDValue VecMask =
27304 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
27305 SDValue MulByMask = DAG.getNode(
27306 ISD::MUL, DL, MaskType,
27307 DAG.getNode(ISD::SPLAT_VECTOR, DL, MaskType, VScale),
27308 DAG.getBuildVector(MaskType, DL,
27309 ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
27310 SDValue UpdatedVecMask =
27311 DAG.getNode(ISD::ADD, DL, MaskType, VecMask, MulByMask);
27312 SVEMask = convertToScalableVector(
27313 DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask);
27314 }
27315 Shuffle =
27316 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
27317 DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
27318 Op1, Op2, SVEMask);
27319 }
27320 Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
27321 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
27322}
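
// Shape of the emitted code (a sketch for a two-source v8i32 shuffle with SVE2
// and a known 256-bit vector length): indices that select from the second
// source are rebased by IndexLen - ElementsPerVectorReg, the padding lanes get
// an out-of-range index so they read as zero, and the shuffle itself becomes
//   tbl z0.s, { z1.s, z2.s }, zMask.s     ; via aarch64.sve.tbl2
// When only the minimum vector length is known, the mask is instead computed
// as "index + AddRuntimeVLMask[i] * (vscale * MinNumElts)" so that the
// second-source indices are corrected at run time.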
27323
27324SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
27325 SDValue Op, SelectionDAG &DAG) const {
27326 EVT VT = Op.getValueType();
27327 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27328
27329 auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
27330 auto ShuffleMask = SVN->getMask();
27331
27332 SDLoc DL(Op);
27333 SDValue Op1 = Op.getOperand(0);
27334 SDValue Op2 = Op.getOperand(1);
27335
27336 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27337 Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
27338 Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
27339
27340 auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
27341 if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
27342 return MVT::i32;
27343 return ScalarTy;
27344 };
27345
27346 if (SVN->isSplat()) {
27347 unsigned Lane = std::max(0, SVN->getSplatIndex());
27348 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
27349 SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
27350 DAG.getConstant(Lane, DL, MVT::i64));
27351 Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
27352 return convertFromScalableVector(DAG, VT, Op);
27353 }
27354
27355 bool ReverseEXT = false;
27356 unsigned Imm;
27357 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
27358 Imm == VT.getVectorNumElements() - 1) {
27359 if (ReverseEXT)
27360 std::swap(Op1, Op2);
27361 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
27362 SDValue Scalar = DAG.getNode(
27363 ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
27364 DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
27365 Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
27366 return convertFromScalableVector(DAG, VT, Op);
27367 }
27368
27369 for (unsigned LaneSize : {64U, 32U, 16U}) {
27370 if (isREVMask(ShuffleMask, VT, LaneSize)) {
27371 EVT NewVT =
27373 unsigned RevOp;
27374 unsigned EltSz = VT.getScalarSizeInBits();
27375 if (EltSz == 8)
27377 else if (EltSz == 16)
27379 else
27381
27382 Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
27383 Op = LowerToPredicatedOp(Op, DAG, RevOp);
27384 Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
27385 return convertFromScalableVector(DAG, VT, Op);
27386 }
27387 }
27388
27389 if (Subtarget->hasSVE2p1() && VT.getScalarSizeInBits() == 64 &&
27390 isREVMask(ShuffleMask, VT, 128)) {
27391 if (!VT.isFloatingPoint())
27392 return LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
27393
27395 Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
27396 Op = LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
27397 Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
27398 return convertFromScalableVector(DAG, VT, Op);
27399 }
27400
27401 unsigned WhichResult;
27402 if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
27404 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
27405
27406 if (isTRNMask(ShuffleMask, VT, WhichResult)) {
27407 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
27409 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
27410 }
27411
27412 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
27414 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
27415
27416 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
27417 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
27419 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
27420 }
27421
27422 // Functions like isZIPMask return true when an ISD::VECTOR_SHUFFLE's mask
27423 // represents the same logical operation as performed by a ZIP instruction. In
27424 // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
27425 // equivalent to an AArch64 instruction. There's the extra component of
27426 // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
27427 // only operated on 64/128bit vector types that have a direct mapping to a
27428 // target register and so an exact mapping is implied.
27429 // However, when using SVE for fixed length vectors, most legal vector types
27430 // are actually sub-vectors of a larger SVE register. When mapping
27431 // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
27432 // how the mask's indices translate. Specifically, when the mapping requires
27433 // an exact meaning for a specific vector index (e.g. Index X is the last
27434 // vector element in the register) then such mappings are often only safe when
27435 // the exact SVE register size is known. The main exception to this is when
27436 // indices are logically relative to the first element of either
27437 // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
27438 // when converting from fixed-length to scalable vector types (i.e. the start
27439 // of a fixed length vector is always the start of a scalable vector).
27440 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
27441 unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
27442 if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
27443 if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
27444 Op2.isUndef()) {
27445 Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
27446 return convertFromScalableVector(DAG, VT, Op);
27447 }
27448
27449 if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
27451 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
27452
27453 if (isUZPMask(ShuffleMask, VT, WhichResult)) {
27454 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
27456 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
27457 }
27458
27459 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
27461 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
27462
27463 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
27464 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
27466 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
27467 }
27468 }
27469
27470 // Avoid producing TBL instruction if we don't know SVE register minimal size,
27471 // unless NEON is not available and we can assume minimal SVE register size is
27472 // 128-bits.
27473 if (MinSVESize || !Subtarget->isNeonAvailable())
27474 return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
27475 DAG);
27476
27477 return SDValue();
27478}
27479
27480SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
27481 SelectionDAG &DAG) const {
27482 SDLoc DL(Op);
27483 EVT InVT = Op.getValueType();
27484
27485 assert(VT.isScalableVector() && isTypeLegal(VT) &&
27486 InVT.isScalableVector() && isTypeLegal(InVT) &&
27487 "Only expect to cast between legal scalable vector types!");
27488 assert(VT.getVectorElementType() != MVT::i1 &&
27489 InVT.getVectorElementType() != MVT::i1 &&
27490 "For predicate bitcasts, use getSVEPredicateBitCast");
27491
27492 if (InVT == VT)
27493 return Op;
27494
27496 EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
27497
27498 // Safe bitcasting between unpacked vector types of different element counts
27499 // is currently unsupported because the following is missing the necessary
27500 // work to ensure the result's elements live where they're supposed to within
27501 // an SVE register.
27502 // 01234567
27503 // e.g. nxv2i32 = XX??XX??
27504 // nxv4f16 = X?X?X?X?
27506 VT == PackedVT || InVT == PackedInVT) &&
27507 "Unexpected bitcast!");
27508
27509 // Pack input if required.
27510 if (InVT != PackedInVT)
27511 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
27512
27513 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
27514
27515 // Unpack result if required.
27516 if (VT != PackedVT)
27518
27519 return Op;
27520}
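
// Example (a sketch): casting nxv2f32 (unpacked) to nxv4i32 (packed) first
// packs via REINTERPRET_CAST nxv2f32 -> nxv4f32 and then performs a plain
// BITCAST to nxv4i32. In the other direction the BITCAST is done on the packed
// types and the result is reinterpreted back to the unpacked type. The assert
// above is what rules out unpacked-to-unpacked casts with differing element
// counts.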
27521
27523 SDValue N) const {
27524 return ::isAllActivePredicate(DAG, N);
27525}
27526
27528 return ::getPromotedVTForPredicate(VT);
27529}
27530
27531bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
27532 SDValue Op, const APInt &OriginalDemandedBits,
27533 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
27534 unsigned Depth) const {
27535
27536 unsigned Opc = Op.getOpcode();
27537 switch (Opc) {
27538 case AArch64ISD::VSHL: {
27539 // Match (VSHL (VLSHR Val X) X)
27540 SDValue ShiftL = Op;
27541 SDValue ShiftR = Op->getOperand(0);
27542 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
27543 return false;
27544
27545 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
27546 return false;
27547
27548 unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
27549 unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
27550
27551 // Other cases can be handled as well, but this is not
27552 // implemented.
27553 if (ShiftRBits != ShiftLBits)
27554 return false;
27555
27556 unsigned ScalarSize = Op.getScalarValueSizeInBits();
27557 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
27558
27559 APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
27560 APInt UnusedBits = ~OriginalDemandedBits;
27561
27562 if ((ZeroBits & UnusedBits) != ZeroBits)
27563 return false;
27564
27565 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
27566 // used - simplify to just Val.
27567 return TLO.CombineTo(Op, ShiftR->getOperand(0));
27568 }
27569 case AArch64ISD::BICi: {
27570 // Fold BICi if all destination bits are already known to be zeroed.
27571 SDValue Op0 = Op.getOperand(0);
27572 KnownBits KnownOp0 =
27573 TLO.DAG.computeKnownBits(Op0, OriginalDemandedElts, Depth + 1);
27574 // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2))
27575 uint64_t BitsToClear = Op->getConstantOperandVal(1)
27576 << Op->getConstantOperandVal(2);
27577 APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
27578 if (APInt(Known.getBitWidth(), BitsToClear)
27579 .isSubsetOf(AlreadyZeroedBitsToClear))
27580 return TLO.CombineTo(Op, Op0);
27581
27582 Known = KnownOp0 &
27583 KnownBits::makeConstant(APInt(Known.getBitWidth(), ~BitsToClear));
27584
27585 return false;
27586 }
27588 if (auto ElementSize = IsSVECntIntrinsic(Op)) {
27589 unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
27590 if (!MaxSVEVectorSizeInBits)
27591 MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
27592 unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
27593 // The SVE count intrinsics don't support the multiplier immediate so we
27594 // don't have to account for that here. The value returned may be slightly
27595 // over the true required bits, as this is based on the "ALL" pattern. The
27596 // other patterns are also exposed by these intrinsics, but they all
27597 // return a value that's strictly less than "ALL".
27598 unsigned RequiredBits = llvm::bit_width(MaxElements);
27599 unsigned BitWidth = Known.Zero.getBitWidth();
27600 if (RequiredBits < BitWidth)
27601 Known.Zero.setHighBits(BitWidth - RequiredBits);
27602 return false;
27603 }
27604 }
27605 }
27606
27608 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
27609}
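
// Worked example for the VSHL case above (8-bit lanes, shift amount 3):
//   x            = abcdefgh
//   VLSHR(x, 3)  = 000abcde
//   VSHL(.., 3)  = abcde000      ; the low three bits are forced to zero
// If the caller never demands those low three bits (they are clear in
// OriginalDemandedBits), the shift pair is a no-op for the demanded bits and
// the whole expression is replaced by x via TLO.CombineTo.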
27610
27611bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
27612 return Op.getOpcode() == AArch64ISD::DUP ||
27613 Op.getOpcode() == AArch64ISD::MOVI ||
27614 (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
27615 Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
27617}
27618
27620 return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
27621 Subtarget->hasComplxNum();
27622}
27623
27626 auto *VTy = dyn_cast<VectorType>(Ty);
27627 if (!VTy)
27628 return false;
27629
27630 // If the vector is scalable, SVE is enabled, implying support for complex
27631 // numbers. Otherwise, we need to ensure complex number support is available
27632 if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
27633 return false;
27634
27635 auto *ScalarTy = VTy->getScalarType();
27636 unsigned NumElements = VTy->getElementCount().getKnownMinValue();
27637
27638 // We can only process vectors that have a bit size of 128 or higher (with an
27639 // additional 64 bits for Neon). Additionally, these vectors must have a
27640 // power-of-2 size, as we later split them into the smallest supported size
27641 // and merge them back together after applying the complex operation.
27642 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
27643 if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
27644 !llvm::isPowerOf2_32(VTyWidth))
27645 return false;
27646
27647 if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
27648 unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
27649 return 8 <= ScalarWidth && ScalarWidth <= 64;
27650 }
27651
27652 return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
27653 ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
27654}
27655
27656 Value *AArch64TargetLowering::createComplexDeinterleavingIR(
27657 IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
27658 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
27659 Value *Accumulator) const {
27660 VectorType *Ty = cast<VectorType>(InputA->getType());
27661 bool IsScalable = Ty->isScalableTy();
27662 bool IsInt = Ty->getElementType()->isIntegerTy();
27663
27664 unsigned TyWidth =
27665 Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
27666
27667 assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
27668 "Vector type must be either 64 or a power of 2 that is at least 128");
27669
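// Wider-than-128-bit types are split in half, each half (including the
// accumulator, if any) is lowered recursively, and the two results are
// concatenated back together.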
27670 if (TyWidth > 128) {
27671 int Stride = Ty->getElementCount().getKnownMinValue() / 2;
27672 auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
27673 auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0));
27674 auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0));
27675 auto *UpperSplitA =
27676 B.CreateExtractVector(HalfTy, InputA, B.getInt64(Stride));
27677 auto *UpperSplitB =
27678 B.CreateExtractVector(HalfTy, InputB, B.getInt64(Stride));
27679 Value *LowerSplitAcc = nullptr;
27680 Value *UpperSplitAcc = nullptr;
27681 if (Accumulator) {
27682 LowerSplitAcc = B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(0));
27683 UpperSplitAcc =
27684 B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(Stride));
27685 }
27686 auto *LowerSplitInt = createComplexDeinterleavingIR(
27687 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
27688 auto *UpperSplitInt = createComplexDeinterleavingIR(
27689 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
27690
27691 auto *Result = B.CreateInsertVector(Ty, PoisonValue::get(Ty), LowerSplitInt,
27692 B.getInt64(0));
27693 return B.CreateInsertVector(Ty, Result, UpperSplitInt, B.getInt64(Stride));
27694 }
27695
27696 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
27697 if (Accumulator == nullptr)
27698 Accumulator = Constant::getNullValue(Ty);
27699
27700 if (IsScalable) {
27701 if (IsInt)
27702 return B.CreateIntrinsic(
27703 Intrinsic::aarch64_sve_cmla_x, Ty,
27704 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
27705
27706 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
27707 return B.CreateIntrinsic(
27708 Intrinsic::aarch64_sve_fcmla, Ty,
27709 {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
27710 }
27711
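// NEON path: the vcmla intrinsic variant is chosen by indexing this table
// with the requested rotation (0, 90, 180 or 270 degrees).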
27712 Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
27713 Intrinsic::aarch64_neon_vcmla_rot90,
27714 Intrinsic::aarch64_neon_vcmla_rot180,
27715 Intrinsic::aarch64_neon_vcmla_rot270};
27716
27717
27718 return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
27719 {Accumulator, InputA, InputB});
27720 }
27721
27722 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
27723 if (IsScalable) {
27724 if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
27725 Rotation == ComplexDeinterleavingRotation::Rotation_270) {
27726 if (IsInt)
27727 return B.CreateIntrinsic(
27728 Intrinsic::aarch64_sve_cadd_x, Ty,
27729 {InputA, InputB, B.getInt32((int)Rotation * 90)});
27730
27731 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
27732 return B.CreateIntrinsic(
27733 Intrinsic::aarch64_sve_fcadd, Ty,
27734 {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
27735 }
27736 return nullptr;
27737 }
27738
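// NEON path: only the 90- and 270-degree rotations have a matching vcadd
// intrinsic; any other rotation falls through and returns nullptr.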
27739 Intrinsic::ID IntId = Intrinsic::not_intrinsic;
27740 if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
27741 IntId = Intrinsic::aarch64_neon_vcadd_rot90;
27742 else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
27743 IntId = Intrinsic::aarch64_neon_vcadd_rot270;
27744
27745 if (IntId == Intrinsic::not_intrinsic)
27746 return nullptr;
27747
27748 return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
27749 }
27750
27751 return nullptr;
27752}
27753
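// Keep an extend of a splat in vector form when it feeds a multiply,
// presumably so it can fold into a widening multiply (smull/umull);
// otherwise prefer scalarizing the splat.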
27754bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
27755 unsigned Opc = N->getOpcode();
27756 if (ISD::isExtOpcode(Opc)) {
27757 if (any_of(N->uses(),
27758 [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
27759 return false;
27760 }
27761 return true;
27762}
27763
27764unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
27765 return Subtarget->getMinimumJumpTableEntries();
27766}
27767
27768 MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
27769 CallingConv::ID CC,
27770 EVT VT) const {
27771 bool NonUnitFixedLengthVector =
27772 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
27773 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
27774 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
27775
27776 EVT VT1;
27777 MVT RegisterVT;
27778 unsigned NumIntermediates;
27779 getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
27780 RegisterVT);
27781 return RegisterVT;
27782}
27783
27784 unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
27785 LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
27786 bool NonUnitFixedLengthVector =
27787 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
27788 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
27789 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
27790
27791 EVT VT1;
27792 MVT VT2;
27793 unsigned NumIntermediates;
27794 return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
27795 NumIntermediates, VT2);
27796}
27797
27798 unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
27799 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
27800 unsigned &NumIntermediates, MVT &RegisterVT) const {
27801 int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
27802 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
27803 if (!RegisterVT.isFixedLengthVector() ||
27804 RegisterVT.getFixedSizeInBits() <= 128)
27805 return NumRegs;
27806
27807 assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
27808 assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
27809 assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
27810
27811 // A size mismatch here implies either type promotion or widening and would
27812 // have resulted in scalarisation if larger vectors had not been available.
27813 if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
27814 EVT EltTy = VT.getVectorElementType();
27815 EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
27816 if (!isTypeLegal(NewVT))
27817 NewVT = EltTy;
27818
27819 IntermediateVT = NewVT;
27820 NumIntermediates = VT.getVectorNumElements();
27821 RegisterVT = getRegisterType(Context, NewVT);
27822 return NumIntermediates;
27823 }
27824
27825 // SVE VLS support does not introduce a new ABI so we should use NEON sized
27826 // types for vector arguments and returns.
27827
27828 unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
27829 NumIntermediates *= NumSubRegs;
27830 NumRegs *= NumSubRegs;
27831
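// Each sub-register is represented with the 128-bit NEON vector type that has
// the matching element type.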
27832 switch (RegisterVT.getVectorElementType().SimpleTy) {
27833 default:
27834 llvm_unreachable("unexpected element type for vector");
27835 case MVT::i8:
27836 IntermediateVT = RegisterVT = MVT::v16i8;
27837 break;
27838 case MVT::i16:
27839 IntermediateVT = RegisterVT = MVT::v8i16;
27840 break;
27841 case MVT::i32:
27842 IntermediateVT = RegisterVT = MVT::v4i32;
27843 break;
27844 case MVT::i64:
27845 IntermediateVT = RegisterVT = MVT::v2i64;
27846 break;
27847 case MVT::f16:
27848 IntermediateVT = RegisterVT = MVT::v8f16;
27849 break;
27850 case MVT::f32:
27851 IntermediateVT = RegisterVT = MVT::v4f32;
27852 break;
27853 case MVT::f64:
27854 IntermediateVT = RegisterVT = MVT::v2f64;
27855 break;
27856 case MVT::bf16:
27857 IntermediateVT = RegisterVT = MVT::v8bf16;
27858 break;
27859 }
27860
27861 return NumRegs;
27862}
27863
27864 bool AArch64TargetLowering::hasInlineStackProbe(
27865 const MachineFunction &MF) const {
27866 return !Subtarget->isTargetWindows() &&
27867 MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
27868}
27869
27870#ifndef NDEBUG
27871 void AArch64TargetLowering::verifyTargetSDNode(const SDNode *N) const {
27872 switch (N->getOpcode()) {
27873 default:
27874 break;
27875 case AArch64ISD::SUNPKLO:
27876 case AArch64ISD::SUNPKHI:
27877 case AArch64ISD::UUNPKLO:
27878 case AArch64ISD::UUNPKHI: {
27879 assert(N->getNumValues() == 1 && "Expected one result!");
27880 assert(N->getNumOperands() == 1 && "Expected one operand!");
27881 EVT VT = N->getValueType(0);
27882 EVT OpVT = N->getOperand(0).getValueType();
27883 assert(OpVT.isVector() && VT.isVector() && OpVT.isInteger() &&
27884 VT.isInteger() && "Expected integer vectors!");
27885 assert(OpVT.getSizeInBits() == VT.getSizeInBits() &&
27886 "Expected vectors of equal size!");
27887 // TODO: Enable assert once bogus creations have been fixed.
27888 // assert(OpVT.getVectorElementCount() == VT.getVectorElementCount()*2 &&
27889 // "Expected result vector with half the lanes of its input!");
27890 break;
27891 }
27892 case AArch64ISD::TRN1:
27893 case AArch64ISD::TRN2:
27894 case AArch64ISD::UZP1:
27895 case AArch64ISD::UZP2:
27896 case AArch64ISD::ZIP1:
27897 case AArch64ISD::ZIP2: {
27898 assert(N->getNumValues() == 1 && "Expected one result!");
27899 assert(N->getNumOperands() == 2 && "Expected two operands!");
27900 EVT VT = N->getValueType(0);
27901 EVT Op0VT = N->getOperand(0).getValueType();
27902 EVT Op1VT = N->getOperand(1).getValueType();
27903 assert(VT.isVector() && Op0VT.isVector() && Op1VT.isVector() &&
27904 "Expected vectors!");
27905 // TODO: Enable assert once bogus creations have been fixed.
27906 // assert(VT == Op0VT && VT == Op1VT && "Expected matching vectors!");
27907 break;
27908 }
27909 }
27910}
27911#endif
unsigned const MachineRegisterInfo * MRI
static MCRegister MatchRegisterName(StringRef Name)
static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc, uint64_t &Imm)
static bool isIntImmediate(const SDNode *N, uint64_t &Imm)
isIntImmediate - This method tests to see if the node is a constant operand.
static void CustomNonLegalBITCASTResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, EVT ExtendVT, EVT CastVT)
static bool isConcatMask(ArrayRef< int > Mask, EVT VT, bool SplitLHS)
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG)
static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode CC, bool NoNans, EVT VT, const SDLoc &dl, SelectionDAG &DAG)
static bool isAddSubSExt(SDValue N, SelectionDAG &DAG)
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue CCOp, AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC, const SDLoc &DL, SelectionDAG &DAG)
can be transformed to: not (and (not (and (setCC (cmp C)) (setCD (cmp D)))) (and (not (setCA (cmp A))...
static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2, bool &Invert)
changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC usable with the vector...
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt)
isVShiftRImm - Check if this is a valid build_vector for the immediate operand of a vector shift righ...
static bool isSingletonEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG)
static SDValue performCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex)
static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue NormalizeBuildVector(SDValue Op, SelectionDAG &DAG)
static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of zeros to a vector store by scalar stores of WZR/XZR.
static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG)
static SDValue performLastTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue GenerateTBL(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performDUPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static std::optional< PredicateConstraint > parsePredicateConstraint(StringRef Constraint)
static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static void analyzeCallOperands(const AArch64TargetLowering &TLI, const AArch64Subtarget *Subtarget, const TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo)
static std::optional< unsigned > IsSVECntIntrinsic(SDValue S)
static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG)
static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo)
Check whether or not Op is a SET_CC operation, either a generic or an AArch64 lowered one.
static bool isLegalArithImmed(uint64_t C)
static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT)
static ScalableVectorType * getSVEContainerIRType(FixedVectorType *VTy)
static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG)
unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend)
static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG)
static SDValue convertFixedMaskToScalableVector(SDValue Mask, SelectionDAG &DAG)
static bool shouldSinkVScale(Value *Op, SmallVectorImpl< Use * > &Ops)
We want to sink following cases: (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A,...
static bool isZeroingInactiveLanes(SDValue Op)
static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG)
static bool isREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
static SDValue tryCombineMULLWithUZP1(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG, bool isSigned)
static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG)
static bool isZerosVector(const SDNode *N)
isZerosVector - Check whether SDNode N is a zero-filled vector.
static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth=0)
static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG)
static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG)
Get rid of unnecessary NVCASTs (that don't change the type).
static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
Fold a floating-point divide by power of two into fixed-point to floating-point conversion.
static const TargetRegisterClass * getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT)
static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG, bool Invert)
static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset, SDLoc DL, unsigned BitWidth)
static bool isPredicateCCSettingOp(SDValue N)
static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG)
static bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType)
static SDValue performSVEAndCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG)
static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, ArrayRef< int > ShuffleMask, EVT VT, EVT ContainerVT, SelectionDAG &DAG)
static SDValue performBRCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static MVT getSVEContainerType(EVT ContentTy)
static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG)
static bool isMergePassthruOpcode(unsigned Opc)
static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG, SDLoc DL, bool &IsMLA)
static SDValue performFADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performNEONPostLDSTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Target-specific DAG combine function for NEON load/store intrinsics to merge base address updates.
static void ReplaceCMP_SWAP_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp, SelectionDAG &DAG)
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget, const AArch64TargetLowering &TLI)
static bool isZeroExtended(SDValue N, SelectionDAG &DAG)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static SDValue performSelectCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with the compare-mask instruct...
static bool isCheapToExtend(const SDValue &N)
static cl::opt< bool > EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, cl::desc("Enable AArch64 logical imm instruction " "optimization"), cl::init(true))
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl< Use * > &Ops)
static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG)
static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes, unsigned ScalarSizeInBytes)
Check if the value of OffsetInBytes can be used as an immediate for the gather load/prefetch and scat...
static bool isUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of "vector_shuffle v,...
static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
#define LCALLNAME4(A, B)
static unsigned getDUPLANEOp(EVT EltType)
static void changeFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
static bool isTRNMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget, const TargetMachine &TM)
static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, EVT VT, EVT MemVT, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT, SelectionDAG &DAG, unsigned &ShiftValue, SDValue &RShOperand)
static bool isExtendOrShiftOperand(SDValue N)
static bool isLanes1toNKnownZero(SDValue Op)
static bool setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL, AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI)
Set the IntrinsicInfo for the aarch64_sve_st<N> intrinsics.
static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG)
static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N, SelectionDAG &DAG)
static EVT getPackedSVEVectorVT(EVT VT)
static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performFlagSettingCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned GenericOpcode)
static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performCSELCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static void ReplaceReductionResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, unsigned InterOp, unsigned AcrossOp)
static bool isEquivalentMaskless(unsigned CC, unsigned width, ISD::LoadExtType ExtType, int AddConstant, int CompConstant)
static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG)
static EVT getExtensionTo64Bits(const EVT &OrigVT)
static bool isCMP(SDValue Op)
static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG)
static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static Function * getStructuredLoadFunction(Module *M, unsigned Factor, bool Scalable, Type *LDVTy, Type *PtrTy)
static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG)
static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc, SelectionDAG &DAG, bool UnpredOp=false, bool SwapOperands=false)
static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad)
static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, AArch64CC::CondCode Predicate)
Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain of CCMP/CFCMP ops.
static SDValue performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated)
static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian)
static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT, unsigned Opcode, SelectionDAG &DAG)
static bool createTblShuffleForZExt(ZExtInst *ZExt, FixedVectorType *DstTy, bool IsLittleEndian)
static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, SelectionDAG &DAG)
static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint)
static bool isINSMask(ArrayRef< int > M, int NumInputElements, bool &DstIsLeft, int &Anomaly)
static const MCPhysReg GPRArgRegs[]
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, APInt &UndefBits)
static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG)
static SDValue performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isPassedInFPR(EVT VT)
static unsigned getIntrinsicID(const SDNode *N)
static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert)
static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG)
static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N, SDValue &BasePtr, SDValue &Index, SelectionDAG &DAG)
static SDValue performANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool canEmitConjunction(const SDValue Val, bool &CanNegate, bool &MustBeFirst, bool WillNegate, unsigned Depth=0)
Returns true if Val is a tree of AND/OR/SETCC operations that can be expressed as a conjunction.
static bool isWideDUPMask(ArrayRef< int > M, EVT VT, unsigned BlockSize, unsigned &DupLaneOp)
Check if a vector shuffle corresponds to a DUP instructions with a larger element width than the vect...
static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static cl::opt< bool > EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden, cl::desc("Combine ext and trunc to TBL"), cl::init(true))
static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, SDValue SplatVal, unsigned NumVecElts)
static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *ST)
static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2, unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &dl)
GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit the specified operations t...
static SDValue performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue removeRedundantInsertVectorElt(SDNode *N)
static std::optional< AArch64CC::CondCode > getCSETCondCode(SDValue Op)
static SDValue optimizeIncrementingWhile(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsEqual)
static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG)
Legalize the gather prefetch (scalar + vector addressing mode) when the offset vector is an unpacked ...
static bool isNegatedInteger(SDValue Op)
static SDValue performFirstTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static bool isLoadOrMultipleLoads(SDValue B, SmallVector< LoadSDNode * > &Loads)
static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc)
static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16)
static Function * getStructuredStoreFunction(Module *M, unsigned Factor, bool Scalable, Type *STVTy, Type *PtrTy)
static SDValue performVectorShiftCombine(SDNode *N, const AArch64TargetLowering &TLI, TargetLowering::DAGCombinerInfo &DCI)
Optimize a vector shift instruction and its operand if shifted out bits are not used.
static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG)
static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, unsigned ScalarSizeInBytes)
Combines a node carrying the intrinsic aarch64_sve_prf<T>_gather_scalar_offset into a node that uses ...
static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of a scalar to a vector store by scalar stores of the scalar value.
unsigned getSignExtendedGatherOpcode(unsigned Opcode)
static bool isOrXorChain(SDValue N, unsigned &Num, SmallVector< std::pair< SDValue, SDValue >, 16 > &WorkList)
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt)
getVShiftImm - Check if this is a valid build_vector for the immediate operand of a vector shift oper...
static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC)
changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 CC
static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd)
static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG)
static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, const APInt &Demanded, TargetLowering::TargetLoweringOpt &TLO, unsigned NewOpc)
static unsigned getCmpOperandFoldingProfit(SDValue Op)
Returns how profitable it is to fold a comparison's operand's shift and/or extension operations.
static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performConcatVectorsCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, SelectionDAG &DAG)
#define MAKE_CASE(V)
static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG)
static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG)
Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern into sext/zext(buildvecto...
static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode)
static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG)
static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG)
static const MCPhysReg FPRArgRegs[]
static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, SelectionDAG &DAG)
Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR, WZR, invert(<cond>)'.
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static void replaceBoolVectorBitcast(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG)
static SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT, int Pattern)
static bool isEXTMask(ArrayRef< int > M, EVT VT, bool &ReverseEXT, unsigned &Imm)
static std::optional< ReducedGprConstraint > parseReducedGprConstraint(StringRef Constraint)
static SDValue tryCombineFixedPointConvert(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue performSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG)
Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup)) making use of the vector SExt/ZE...
static SDValue performAddSubLongCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG)
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
Fold a floating-point multiply by power of two into floating-point to fixed-point conversion.
static EVT calculatePreExtendType(SDValue Extend)
Calculates what the pre-extend type is, based on the extension operation node provided by Extend.
static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG)
static EVT getPromotedVTForPredicate(EVT VT)
static void changeFPCCToANDAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
Convert a DAG fp condition code to an AArch64 CC.
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
Turn vector tests of the signbit in the form of: xor (sra X, elt_size(X)-1), -1 into: cmge X,...
static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG)
static bool isAllConstantBuildVector(const SDValue &PotentialBVec, uint64_t &ConstVal)
static SDValue performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG)
static Value * UseTlsOffset(IRBuilderBase &IRB, unsigned Offset)
static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG)
WidenVector - Given a value in the V64 register class, produce the equivalent value in the V128 regis...
static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG)
static bool isSignExtended(SDValue N, SelectionDAG &DAG)
static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG, const AArch64Subtarget *ST)
static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, AArch64CC::CondCode Cond)
static bool isSetCCOrZExtSetCC(const SDValue &Op, SetCCInfoAndKind &Info)
cl::opt< bool > EnableAArch64ELFLocalDynamicTLSGeneration("aarch64-elf-ldtls-generation", cl::Hidden, cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false))
static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG)
static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG)
static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG)
static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG)
static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG)
static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode, AtomicOrdering Ordering)
static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performPostLD1Combine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, bool IsLaneOp)
Target-specific DAG combine function for post-increment LD1 (lane) and post-increment LD1R.
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
std::pair< SDValue, uint64_t > lookThroughSignExtension(SDValue Val)
bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL)
static SDValue performMSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG)
static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale, SDLoc DL, SelectionDAG &DAG)
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG)
static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG)
static bool isSplatShuffle(Value *V)
bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT)
static SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performAddCombineForShiftedOperands(SDNode *N, SelectionDAG &DAG)
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V)
static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG, unsigned Opcode, bool IsSigned)
static bool isPackedVectorType(EVT VT, SelectionDAG &DAG)
Returns true if VT's elements occupy the lowest bit positions of its associated register class withou...
static bool isTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of "vector_shuffle v,...
static bool isAddSubZExt(SDValue N, SelectionDAG &DAG)
static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt)
isVShiftLImm - Check if this is a valid build_vector for the immediate operand of a vector shift left...
static SDValue performExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performMaskedGatherScatterCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, SelectionDAG &DAG)
static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl, SelectionDAG &DAG, SDValue Chain, bool IsSignaling)
static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performBuildVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64TargetLowering &TLI)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG)
static unsigned getSMCondition(const SMEAttrs &CallerAttrs, const SMEAttrs &CalleeAttrs)
static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static bool isCMN(SDValue Op, ISD::CondCode CC)
static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N)
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, SDValue Operand, SelectionDAG &DAG, int &ExtraSteps)
static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG)
static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG)
Perform the scalar expression combine in the form of: CSEL(c, 1, cc) + b => CSINC(b+c,...
static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static std::optional< uint64_t > getConstantLaneNumOfExtractHalfOperand(SDValue &Op)
static void ReplaceATOMIC_LOAD_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1, SelectionDAG &DAG, unsigned &NumSubLoads)
static SDValue performLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG, StoreSDNode *Store)
static bool isEssentiallyExtractHighSubvector(SDValue N)
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static unsigned getExtFactor(SDValue &V)
getExtFactor - Determine the adjustment factor for the position when generating an "extract from vect...
static cl::opt< unsigned > MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden, cl::desc("Maximum of xors"))
#define LCALLNAME5(A, B)
static SDValue performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
static SDValue performMULLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static const MVT MVT_CC
Value type used for condition codes.
static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performReinterpretCastCombine(SDNode *N)
SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG)
static SDValue performTBZCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC)
Emit expression as a conjunction (a series of CCMP/CFCMP ops).
static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N)
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, const SDLoc &dl)
static bool performTBISimplification(SDValue Addr, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Simplify Addr given that the top byte of it is ignored by HW during address translation.
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat=false)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG)
static bool isAllInactivePredicate(SDValue N)
static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT, SDLoc DL, SelectionDAG &DAG)
static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static cl::opt< bool > EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden, cl::desc("Combine extends of AArch64 masked " "gather intrinsics"), cl::init(true))
static bool isZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of "vector_shuffle v,...
static SDValue performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isWideTypeMask(ArrayRef< int > M, EVT VT, SmallVectorImpl< int > &NewMask)
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z, SelectionDAG &DAG)
static SDValue performANDSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static const TargetRegisterClass * getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT)
static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode, SDNode *AndNode, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex, unsigned CC)
static std::pair< SDValue, SDValue > getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG)
#define FALKOR_STRIDED_ACCESS_MD
@ Generic
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static bool isConstant(const MachineInstr &MI)
static const LLT S1
amdgpu AMDGPU Register Bank Select
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
@ Scaled
@ OP_VEXT3
@ OP_VTRNR
@ OP_VDUP1
@ OP_VZIPR
@ OP_VUZPR
@ OP_VREV
@ OP_VZIPL
@ OP_VTRNL
@ OP_COPY
@ OP_VEXT1
@ OP_VDUP0
@ OP_VEXT2
@ OP_VUZPL
@ OP_VDUP3
@ OP_VDUP2
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy)
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
Symbol * Sym
Definition: ELF_riscv.cpp:479
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static Function * getFunction(Constant *C)
Definition: Evaluator.cpp:236
static bool isSigned(unsigned int Opcode)
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
#define im(i)
Hexagon Common GEP
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
lazy value info
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
mir Rename Register Operands
unsigned const TargetRegisterInfo * TRI
This file provides utility analysis objects describing memory locations.
Module.h This file contains the declarations for the Module class.
LLVMContext & Context
This file defines ARC utility functions which are used by various parts of the compiler.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static CodeModel::Model getCodeModel(const PPCSubtarget &S, const TargetMachine &TM, const MachineOperand &MO)
PowerPC Reduce CR logical Operation
const char LLVMTargetMachineRef TM
static bool getVal(MDTuple *MD, const char *Key, uint64_t &Val)
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallSet class.
This file defines the SmallVector class.
static bool Enabled
Definition: Statistic.cpp:46
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
static const int BlockSize
Definition: TarWriter.cpp:33
This pass exposes codegen information to IR-level passes.
This defines the Use class.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
static constexpr int Concat[]
Value * RHS
Value * LHS
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
void setVarArgsStackOffset(unsigned Offset)
void setTailCallReservedStack(unsigned bytes)
SmallVectorImpl< ForwardedRegister > & getForwardedMustTailRegParms()
void setBytesInStackArgArea(unsigned bytes)
void setHasSwiftAsyncContext(bool HasContext)
void setJumpTableEntryInfo(int Idx, unsigned Size, MCSymbol *PCRelSym)
void setArgumentStackToRestore(unsigned bytes)
void setHasStreamingModeChanges(bool HasChanges)
bool isLegalAddressingMode(unsigned NumBytes, int64_t Offset, unsigned Scale) const
void UpdateCustomCalleeSavedRegs(MachineFunction &MF) const
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
const AArch64RegisterInfo * getRegisterInfo() const override
unsigned getMinimumJumpTableEntries() const
const AArch64InstrInfo * getInstrInfo() const override
const char * getSecurityCheckCookieName() const
unsigned getMaximumJumpTableSize() const
ARMProcFamilyEnum getProcFamily() const
Returns ARM processor family.
unsigned classifyGlobalFunctionReference(const GlobalValue *GV, const TargetMachine &TM) const
Align getPrefLoopAlignment() const
Align getPrefFunctionAlignment() const
unsigned getMaxBytesForLoopAlignment() const
bool supportsAddressTopByteIgnored() const
CPU has TBI (top byte of addresses is ignored during HW address translation) and OS enables it.
const Triple & getTargetTriple() const
bool isCallingConvWin64(CallingConv::ID CC) const
const char * getChkStkName() const
bool useSVEForFixedLengthVectors() const
unsigned ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const
ClassifyGlobalReference - Find the target operand flags that describe how a global value should be re...
bool isSVEAvailable() const
Returns true if the target has SVE and can use the full range of SVE instructions,...
bool isXRegisterReserved(size_t i) const
unsigned getMaxSVEVectorSizeInBits() const
unsigned getMinSVEVectorSizeInBits() const
bool hasCustomCallingConv() const
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
Return true if the given shuffle mask can be codegen'd directly, or if it should be stack expanded.
unsigned getVaListSizeInBits(const DataLayout &DL) const override
Returns the size of the platform's va_list object.
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) const override
Return the prefered common base offset.
bool shouldInsertTrailingFenceForAtomicStore(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert a trailing fence without reducing the ordering f...
bool shouldExpandCttzElements(EVT VT) const override
Return true if the @llvm.experimental.cttz.elts intrinsic should be expanded using generic code in Se...
MachineBasicBlock * EmitTileLoad(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
bool shouldRemoveRedundantExtend(SDValue Op) const override
Return true (the default) if it is profitable to remove a sext_inreg(x) where the sext is redundant,...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC) const
Selects the correct CCAssignFn for a given CallingConvention value.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ISD::SETCC ValueType.
bool optimizeExtendOrTruncateConversion(Instruction *I, Loop *L, const TargetTransformInfo &TTI) const override
Try to optimize extending or truncating conversion instructions (like zext, trunc,...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const
Selects the correct CCAssignFn for a given CallingConvention value.
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool hasInlineStackProbe(const MachineFunction &MF) const override
True if stack clash protection is enabled for this functions.
bool isLegalICmpImmediate(int64_t) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: sub y, (xor x, -1) add (add x, 1), y The variant with two add's is IR...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
MachineBasicBlock * EmitZAInstr(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB, bool HasTile) const
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool isOpSuitableForLSE128(const Instruction *I) const
bool lowerInterleavedLoad(LoadInst *LI, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor) const override
Lower an interleaved load into a ldN intrinsic.
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool shouldSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
bool fallBackToDAGISel(const Instruction &Inst) const override
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
bool isLegalAddScalableImmediate(int64_t) const override
Return true if adding the specified scalable immediate is legal, that is the target has add instructi...
Function * getSSPStackGuardCheck(const Module &M) const override
If the target has a standard stack protection check function that performs validation and error handl...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
Returns true if the target allows unaligned memory accesses of the specified type.
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purpose.
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
For some targets, an LLVM struct type must be broken down into multiple simple types,...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
MachineBasicBlock * EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const override
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const
MachineBasicBlock * EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool useLoadStackGuardNode() const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const override
If the target has a standard location for the unsafe stack pointer, returns the address of that locat...
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist instruction in then/else to if.
bool isOpSuitableForRCPC3(const Instruction *I) const
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const override
Return true if it is profitable to reduce a load to a smaller type.
MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const override
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override
Lower an interleaved store into a stN intrinsic.
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
MachineBasicBlock * EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode, bool Op0IsDef) const
MachineBasicBlock * EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
Control the following reassociation of operands: (op (op x, c1), y) -> (op (op x, y),...
void verifyTargetSDNode(const SDNode *N) const override
Check the given SDNode. Aborts if it is invalid.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass, if at all.
MachineBasicBlock * EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *BB) const
LLT getOptimalMemOpLLT(const MemOp &Op, const AttributeList &FuncAttributes) const override
LLT returning variant.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool needsFixedCatchObjects() const override
Used for exception handling on Win64.
bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI, LoadInst *LI) const override
Lower a deinterleave intrinsic to a target specific load intrinsic.
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Value * getIRStackGuard(IRBuilderBase &IRB) const override
If the target has a standard location for the stack protector cookie, returns the address of that loc...
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
bool generateFMAsInMachineCombiner(EVT VT, CodeGenOptLevel OptLevel) const override
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
bool hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const override
Return true if the target supplies and combines to a paired load two loaded values of type LoadedType...
bool isOpSuitableForLDPSTP(const Instruction *I) const
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI)
bool isLegalAddImmediate(int64_t) const override
Return true if the specified immediate is a legal add immediate, that is, the target has an add instruction...
bool shouldConsiderGEPOffsetSplit() const override
bool isVectorClearMaskLegal(ArrayRef< int > M, EVT VT) const override
Similar to isShuffleMaskLegal.
const MCPhysReg * getScratchRegisters(CallingConv::ID CC) const override
Returns a 0 terminated array of registers that can be safely used as scratch registers.
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) const
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II, StoreInst *SI) const override
Lower an interleave intrinsic to a target specific store intrinsic.
bool enableAggressiveFMAFusion(EVT VT) const override
Enable aggressive FMA fusion on targets that want it.
Value * getSDagStackGuard(const Module &M) const override
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
MachineBasicBlock * EmitDynamicProbedAlloc(MachineInstr &MI, MachineBasicBlock *MBB) const
SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, unsigned Condition, SDValue PStateSM=SDValue()) const
If a change in streaming mode is required on entry to/return from a function call it emits and return...
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override
Return true if the @llvm.get.active.lane.mask intrinsic should be expanded using generic code in Sele...
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON=false) const
bool mergeStoresAfterLegalization(EVT VT) const override
SVE code generation for fixed length vectors does not custom lower BUILD_VECTOR.
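Many of the hooks listed above answer simple encoding questions about immediates. As a hedged illustration only, the standalone sketch below restates the well-known AArch64 ADD/SUB immediate rule (a 12-bit unsigned value, optionally shifted left by 12) with a hypothetical helper name; it is not the implementation in this file.

#include <cassert>
#include <cstdint>

// Hypothetical sketch of an add-immediate legality check: a 12-bit unsigned
// immediate, optionally shifted left by 12. Negative values are treated by
// magnitude, on the assumption they would be materialized with SUB.
static bool isLegalAddImmediateSketch(int64_t Imm) {
  uint64_t U = Imm < 0 ? 0 - static_cast<uint64_t>(Imm)
                       : static_cast<uint64_t>(Imm);
  if (U <= 0xfff)
    return true;                                   // plain uimm12
  return (U & 0xfff) == 0 && (U >> 12) <= 0xfff;   // uimm12, LSL #12
}

int main() {
  assert(isLegalAddImmediateSketch(4095));
  assert(isLegalAddImmediateSketch(4096));   // 1 << 12
  assert(!isLegalAddImmediateSketch(4097));  // would need two instructions
  return 0;
}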
Class for arbitrary precision integers.
Definition: APInt.h:76
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:212
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:427
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:981
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition: APInt.h:207
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1491
static void sdivrem(const APInt &LHS, const APInt &RHS, APInt &Quotient, APInt &Remainder)
Definition: APInt.cpp:1860
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1370
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1002
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:349
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1439
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:187
APInt sadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1898
bool sle(const APInt &RHS) const
Signed less or equal comparison.
Definition: APInt.h:1144
APInt uadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1905
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1589
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:197
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1010
unsigned logBase2() const
Definition: APInt.h:1703
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:805
bool isMask(unsigned numBits) const
Definition: APInt.h:466
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:312
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:954
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1235
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:418
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:284
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition: APInt.h:319
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:274
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1215
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:367
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1513
an instruction to allocate memory on the stack
Definition: Instructions.h:59
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:539
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:748
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:778
@ Or
*p = old | v
Definition: Instructions.h:772
@ And
*p = old & v
Definition: Instructions.h:768
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:776
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:782
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:780
@ Nand
*p = ~(old & v)
Definition: Instructions.h:770
bool isFloatingPointOperation() const
Definition: Instructions.h:922
BinOp getOperation() const
Definition: Instructions.h:845
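shouldExpandAtomicRMWInIR inspects exactly these AtomicRMWInst properties when telling AtomicExpand how to lower an atomicrmw. Below is a hedged, simplified sketch of that kind of dispatch; the policy and helper name are hypothetical, and the real hook returns a TargetLoweringBase::AtomicExpansionKind rather than a bool.

#include "llvm/IR/Instructions.h"
using namespace llvm;

// Hypothetical policy sketch: expand floating-point and NAND atomicrmw to a
// compare-exchange loop, keep the remaining integer operations native.
static bool shouldExpandToCmpXchgSketch(const AtomicRMWInst &RMW) {
  if (RMW.isFloatingPointOperation())
    return true;                      // assumed: FP RMW handled via a CAS loop
  switch (RMW.getOperation()) {
  case AtomicRMWInst::Nand:           // *p = ~(old & v) has no direct encoding
    return true;
  default:
    return false;
  }
}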
This is an SDNode representing atomic operations.
bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:93
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:206
A "pseudo-class" with methods for operating on BUILD_VECTORs.
ConstantFPSDNode * getConstantFPSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant FP or null if this is not a constant FP splat.
bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power or 2,...
CCState - This class holds information needed while lowering arguments and return values.
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
bool isMemLoc() const
int64_t getLocMemOffset() const
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1687
unsigned arg_size() const
Definition: InstrTypes.h:1685
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1871
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
bool isZero() const
Return true if the value is positive or negative zero.
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:205
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:145
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1398
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
bool isLittleEndian() const
Layout endianness...
Definition: DataLayout.h:238
bool isBigEndian() const
Definition: DataLayout.h:239
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:874
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:299
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:296
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:307
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition: FastISel.h:66
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:168
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:135
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:681
bool empty() const
Definition: Function.h:805
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:201
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:678
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition: Function.h:263
Constant * getPersonalityFn() const
Get the personality function associated with this function.
Definition: Function.cpp:1903
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:339
arg_iterator arg_end()
Definition: Function.h:823
arg_iterator arg_begin()
Definition: Function.h:814
size_t size() const
Definition: Function.h:804
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:356
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:675
const GlobalValue * getGlobal() const
bool isThreadLocal() const
If the value is "Thread Local", its value isn't shared by the threads.
Definition: GlobalValue.h:263
bool hasExternalWeakLinkage() const
Definition: GlobalValue.h:529
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
Type * getValueType() const
Definition: GlobalValue.h:296
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:94
Value * CreateZExtOrBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2137
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition: IRBuilder.h:1037
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2472
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition: IRBuilder.h:1881
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2523
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1045
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:539
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2170
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to all NumElts elements.
Definition: IRBuilder.cpp:1214
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2516
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:466
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2067
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2122
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1437
ConstantInt * getInt8(uint8_t C)
Get a constant 8-bit value.
Definition: IRBuilder.h:476
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2081
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:174
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:491
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2127
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1416
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2021
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2494
LLVMContext & getContext() const
Definition: IRBuilder.h:176
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2117
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2007
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1497
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:569
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2412
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Definition: IRBuilder.h:1866
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:516
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2666
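Hooks such as emitLoadLinked, insertSSPDeclarations, and the interleaved-access lowering build IR through IRBuilderBase. A self-contained, hedged example of ordinary IRBuilder usage exercising a few of the calls listed above (generic usage, nothing taken from this file):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("demo", Ctx);
  // Build: i32 @shl_or(i32 %x) { ret i32 ((%x << 3) | 1) }
  Type *I32 = Type::getInt32Ty(Ctx);
  auto *FTy = FunctionType::get(I32, {I32}, /*isVarArg=*/false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage, "shl_or", M);
  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
  IRBuilder<> B(BB);
  Value *Shifted = B.CreateShl(F->getArg(0), B.getInt32(3), "shifted");
  Value *Result = B.CreateOr(Shifted, B.getInt32(1), "result");
  B.CreateRet(Result);
  M.print(outs(), nullptr);
  return 0;
}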
This instruction inserts a single (scalar) element into a VectorType value.
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:82
const BasicBlock * getParent() const
Definition: Instruction.h:152
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:86
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:252
Class to represent integer types.
Definition: DerivedTypes.h:40
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:267
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:100
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
Definition: LowLevelType.h:203
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
Definition: Instructions.h:184
Value * getPointerOperand()
Definition: Instructions.h:280
Type * getPointerOperandType() const
Definition: Instructions.h:283
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
@ INVALID_SIMPLE_VALUE_TYPE
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:585
bool isScalableVT() const
Return true if the type is a scalable type.
static auto all_valuetypes()
SimpleValueType Iteration.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto scalable_vector_valuetypes()
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
static auto fp_fixedlen_vector_valuetypes()
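MVT is the value-type currency of all the lowering code above. A brief hedged sketch of querying it (standard API, not code from this file):

#include "llvm/CodeGen/ValueTypes.h"
#include <cassert>
using namespace llvm;

int main() {
  MVT V = MVT::getVectorVT(MVT::i32, 4);       // v4i32
  assert(V.isVector() && V.is128BitVector());
  assert(V.getVectorElementType() == MVT::i32);
  assert(V.getScalarSizeInBits() == 32);
  assert(MVT::getIntegerVT(64) == MVT::i64);
  return 0;
}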
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor bloc...
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
SSPLayoutKind getObjectSSPLayout(int ObjectIdx) const
void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
@ SSPLK_None
Did not trigger a stack protector.
void setFrameAddressIsTaken(bool T)
int getStackProtectorIndex() const
Return the index for the stack protector object.
int CreateSpillStackObject(uint64_t Size, Align Alignment)
Create a new statically sized stack object that represents a spill slot, returning a nonnegative iden...
void setStackID(int ObjectIdx, uint8_t ID)
void setHasTailCall(bool V=true)
bool hasMustTailInVarArgFunc() const
Returns true if the function is variadic and contains a musttail call.
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int CreateVariableSizedObject(Align Alignment, const AllocaInst *Alloca)
Notify the MachineFrameInfo object that a variable sized object has been created.
int getObjectIndexEnd() const
Return one past the maximum frame object index.
bool hasStackProtectorIndex() const
uint8_t getStackID(int ObjectIdx) const
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
Definition: MachineInstr.h:69
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value,.
MachineOperand class - Representation of each machine instruction operand.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
size_type size() const
Definition: MapVector.h:60
This class is used to represent an MGATHER node.
const SDValue & getPassThru() const
ISD::LoadExtType getExtensionType() const
This is a base class used to represent MGATHER and MSCATTER nodes.
const SDValue & getIndex() const
const SDValue & getScale() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
ISD::MemIndexType getIndexType() const
How is Index applied to BasePtr when computing addresses.
This class is used to represent an MLOAD node.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
const SDValue & getMask() const
const SDValue & getPassThru() const
const SDValue & getOffset() const
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This class is used to represent an MSTORE node.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isVolatile() const
Align getOriginalAlign() const
Returns alignment and volatility of the memory access.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
AtomicOrdering getMergedOrdering() const
Return a single atomic ordering that is at least as strong as both the success and failure orderings ...
const SDValue & getChain() const
bool isNonTemporal() const
bool isAtomic() const
Return true if the memory operation ordering is Unordered or higher.
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
bool getRtLibUseGOT() const
Returns true if PLT should be avoided for RTLib calls.
Definition: Module.cpp:690
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:293
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
void dump() const
Definition: Pass.cpp:136
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1827
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
void dump() const
Dump this node, for debugging.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
iterator_range< use_iterator > uses()
size_t use_size() const
Return the number of uses of this node.
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Return true if this node is an UNDEF value.
void setFlags(SDNodeFlags NewFlags)
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
void dump() const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
void setNode(SDNode *N)
set the SDNode
unsigned getOpcode() const
unsigned getNumOperands() const
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasStreamingInterface() const
bool hasStreamingCompatibleInterface() const
bool hasNonStreamingInterface() const
bool hasStreamingBody() const
bool hasZAState() const
Class to represent scalable SIMD vectors.
Definition: DerivedTypes.h:586
static ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:713
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:225
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:721
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:473
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm, bool ConstantFold=true)
Return a node that represents the runtime scaling 'MulImm * RuntimeVL'.
SDNode * isConstantIntBuildVectorOrConstantInt(SDValue N) const
Test whether the given value is a constant int or similar node.
SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getStepVector(const SDLoc &DL, EVT ResVT, const APInt &StepVal)
Returns a vector of type ResVT whose elements contain the linear sequence <0, Step,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:477
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
Definition: SelectionDAG.h:731
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:827
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:471
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
Definition: SelectionDAG.h:479
bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, bool isTargetGA=false, unsigned TargetFlags=0)
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
Definition: SelectionDAG.h:658
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getRegister(unsigned Reg, EVT VT)
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
SDValue getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT, EVT OpVT)
Convert Op, which must be of integer type, to the integer type VT, by using an extension appropriate ...
SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:472
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:772
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:675
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:767
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:468
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:798
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:844
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
LLVMContext * getContext() const
Definition: SelectionDAG.h:484
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:738
SDValue getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand, SDValue Subreg)
A convenience function for creating TargetInstrInfo::INSERT_SUBREG nodes.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:553
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
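The SelectionDAG members above are the node-construction API that the lowering and combine code in this file runs against. A SelectionDAG only exists inside instruction selection, so the fragment below is a hedged, non-runnable sketch of the usual construction pattern; the helper is hypothetical and real combines would additionally consult legality and TLI hooks.

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Hypothetical helper: build (or (shl X, ShAmt), Y) in X's value type.
static SDValue buildShlOr(SelectionDAG &DAG, const SDLoc &DL, SDValue X,
                          SDValue Y, uint64_t ShAmt) {
  EVT VT = X.getValueType();
  SDValue Amt = DAG.getShiftAmountConstant(ShAmt, VT, DL);
  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X, Amt);
  return DAG.getNode(ISD::OR, DL, VT, Shl, Y);
}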
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static bool isSingleSourceMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector.
static void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
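These ShuffleVectorInst mask predicates classify masks the same way the shuffle lowering in this file does. A small hedged example of the listed helpers (standard API, not code from this file):

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
#include <cassert>
using namespace llvm;

int main() {
  SmallVector<int, 4> Rev = {3, 2, 1, 0};
  assert(ShuffleVectorInst::isReverseMask(Rev, /*NumSrcElts=*/4));

  int Index = 0;
  SmallVector<int, 2> Extract = {2, 3};
  assert(ShuffleVectorInst::isExtractSubvectorMask(Extract, /*NumSrcElts=*/4,
                                                   Index));
  assert(Index == 2);                 // selects elements [2,3] of the source
  return 0;
}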
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
static bool isSplatMask(const int *Mask, EVT VT)
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
void push_back(const T &Elt)
Definition: SmallVector.h:426
pointer data()
Return a pointer to the vector's buffer, even if empty().
Definition: SmallVector.h:299
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
An instruction for storing to memory.
Definition: Instructions.h:317
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition: StringRef.h:456
StringRef slice(size_t Start, size_t End) const
Return a reference to the substring from [Start, End).
Definition: StringRef.h:670
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
StringRef save(const char *S)
Definition: StringSaver.h:30
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
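StringSwitch is the usual way constraint and register-name strings are mapped to values in lowering code. A tiny hedged example of generic usage; the width-suffix table is illustrative and not taken from this file.

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include <cassert>
using namespace llvm;

// Map a hypothetical width-suffix string to a bit width; 0 means unknown.
static unsigned classifyWidthSuffix(StringRef S) {
  return StringSwitch<unsigned>(S)
      .Case("b", 8)
      .Case("h", 16)
      .Case("s", 32)
      .Case("d", 64)
      .Default(0);
}

int main() {
  assert(classifyWidthSuffix("s") == 32);
  assert(classifyWidthSuffix("q") == 0);
  return 0;
}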
Class to represent struct types.
Definition: DerivedTypes.h:216
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:373
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
EVT getMemValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const
Returns the target-specific address of the unsafe stack pointer.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
virtual bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const
Check whether or not MI needs to be moved close to its uses.
void setMaximumJumpTableSize(unsigned)
Indicate the maximum number of entries in jump tables.
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setMaxBytesForAlignment(unsigned MaxBytes)
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual Value * getSDagStackGuard(const Module &M) const
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
unsigned getMaximumJumpTableSize() const
Return upper limit for number of entries in a jump table.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
virtual Value * getIRStackGuard(IRBuilderBase &IRB) const
If the target has a standard location for the stack protector guard, returns the address of that loca...
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is a legal add immediate, that is, the target has add instruction...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const
Return true if it is profitable to reduce a load to a smaller type.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal or custom on this target.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
virtual EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
MVT getFrameIndexTy(const DataLayout &DL) const
Return the type for frame index, which is determined by the alloca address space specified through th...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
virtual bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
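As a hedged sketch of how the TargetLoweringBase configuration hooks above are typically used: the calls below would sit in the constructor of a hypothetical MyTargetLowering subclass, and the register class, types, and actions are illustrative rather than the actual AArch64 settings:
  // MyTarget::GPR64RegClass and Subtarget are hypothetical names.
  addRegisterClass(MVT::i64, &MyTarget::GPR64RegClass);
  computeRegisterProperties(Subtarget.getRegisterInfo());

  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);        // split into SDIV + SREM
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);           // no truncating FP store
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, MVT::i8, Legal);
  setTargetDAGCombine(ISD::AND);                             // ask for target combining of AND nodes
  setMinFunctionAlignment(Align(4));
  setSchedulingPreference(Sched::Hybrid);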
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions Ref: "Hacker's Delight" by Henry Warren 1...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
virtual bool useLoadStackGuardNode() const
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
virtual bool isTargetCanonicalConstantNode(SDValue Op) const
Returns true if the given Opc is considered a canonical constant for the target, which should not be ...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned TLSSize
Bit size of immediate TLS offsets (0 == use the default).
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TargetCostKind CostKind) const
Return the expected cost of materializing for the given integer immediate of the specified type.
@ TCC_Free
Expected to fold away in lowering.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
bool isOSMSVCRT() const
Is this a "Windows" OS targeting a "MSVCRT.dll" environment.
Definition: Triple.h:662
bool isWindowsMSVCEnvironment() const
Checks if the environment could be MSVC.
Definition: Triple.h:629
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:330
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
static Type * getHalfTy(LLVMContext &C)
static Type * getDoubleTy(LLVMContext &C)
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition: Type.h:252
static Type * getBFloatTy(LLVMContext &C)
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
static IntegerType * getInt1Ty(LLVMContext &C)
@ FloatTyID
32-bit floating point type
Definition: Type.h:58
@ DoubleTyID
64-bit floating point type
Definition: Type.h:59
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:302
static IntegerType * getInt16Ty(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
static IntegerType * getInt8Ty(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isScalableTy() const
Return true if this is a type whose size is a known multiple of vscale.
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
static Type * getFloatTy(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:137
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getContainedType(unsigned i) const
This method is used to implement the type iterator (defined at the end of the file).
Definition: Type.h:377
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
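A brief sketch of the Type query API above; Ty is assumed to be an existing llvm::Type* inspected during lowering:
  if (Ty->isVectorTy() && Ty->getScalarType()->isIntegerTy()) {
    unsigned EltBits = Ty->getScalarSizeInBits();  // element width in bits
    (void)EltBits;
  }
  llvm::Type *F32 = llvm::Type::getFloatTy(Ty->getContext());
  (void)F32;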
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1808
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
const Use & getOperandUse(unsigned i) const
Definition: User.h:182
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
This class is used to represent EVT's, which are used to parameterize some operations.
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
Definition: DerivedTypes.h:507
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
Definition: DerivedTypes.h:454
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:676
static VectorType * getTruncatedElementVectorType(VectorType *VTy)
Definition: DerivedTypes.h:472
Type * getElementType() const
Definition: DerivedTypes.h:436
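A small sketch of the VectorType factories above; Ctx is assumed to be an existing LLVMContext:
  #include "llvm/IR/DerivedTypes.h"

  auto *V4I32 = llvm::VectorType::get(llvm::Type::getInt32Ty(Ctx),
                                      llvm::ElementCount::getFixed(4)); // <4 x i32>
  auto *V2I32 = llvm::VectorType::getHalfElementsVectorType(V4I32);     // <2 x i32>
  llvm::Type *EltTy = V4I32->getElementType();                          // i32
  (void)V2I32; (void)EltTy;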
This class represents zero extension of integer types.
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:187
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:239
self_iterator getIterator()
Definition: ilist_node.h:109
#define UINT64_MAX
Definition: DataTypes.h:77
#define INT64_MAX
Definition: DataTypes.h:71
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static CondCode getInvertedCondCode(CondCode Code)
static unsigned getNZCVToSatisfyCondCode(CondCode Code)
Given a condition code, return NZCV flags that would satisfy that condition.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
@ NVCAST
Natural vector cast.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
static uint8_t encodeAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType9(uint64_t Imm)
static bool isAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType5(uint64_t Imm)
static int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType10(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType9(uint64_t Imm)
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
static bool isAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType5(uint64_t Imm)
static int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
static bool isAdvSIMDModImmType10(uint64_t Imm)
static int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType8(uint64_t Imm)
static bool isAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType11(uint64_t Imm)
static bool isAdvSIMDModImmType11(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType6(uint64_t Imm)
static bool isAdvSIMDModImmType8(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType6(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType1(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType1(uint64_t Imm)
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
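A minimal sketch of the logical-immediate helpers above (these live in the AArch64_AM namespace; the constant is illustrative):
  uint64_t Imm = 0x00FF00FF00FF00FFULL;   // repeating 16-bit pattern, encodable
  if (AArch64_AM::isLogicalImmediate(Imm, /*regSize=*/64)) {
    uint64_t Enc = AArch64_AM::encodeLogicalImmediate(Imm, 64);
    // Enc is the N:immr:imms encoding used as the immediate operand of AND/ORR/EOR.
    (void)Enc;
  }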
ArrayRef< MCPhysReg > getFPRArgRegs()
int getSMEPseudoMap(uint16_t Opcode)
static constexpr unsigned SVEMaxBitsPerVector
const unsigned RoundingBitsPos
static constexpr unsigned SVEBitsPerBlock
ArrayRef< MCPhysReg > getGPRArgRegs()
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
@ ARM64EC_Thunk_Native
Calling convention used in the ARM64EC ABI to implement calls between ARM64 code and thunks.
Definition: CallingConv.h:265
@ AArch64_VectorCall
Used between AArch64 Advanced SIMD functions.
Definition: CallingConv.h:221
@ Swift
Calling convention for Swift.
Definition: CallingConv.h:69
@ AArch64_SVE_VectorCall
Used between AArch64 SVE functions.
Definition: CallingConv.h:224
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition: CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserves most registers.
Definition: CallingConv.h:63
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2
Preserve X2-X15, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:241
@ CXX_FAST_TLS
Used for access functions.
Definition: CallingConv.h:72
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0
Preserve X0-X13, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:238
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition: CallingConv.h:50
@ PreserveAll
Used for runtime calls that preserves (almost) all registers.
Definition: CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ Win64
The C convention as implemented on Windows/x86-64 and AArch64.
Definition: CallingConv.h:159
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition: CallingConv.h:87
@ GRAAL
Used by GraalVM. Two additional registers are reserved.
Definition: CallingConv.h:255
@ ARM64EC_Thunk_X64
Calling convention used in the ARM64EC ABI to implement calls between x64 code and thunks.
Definition: CallingConv.h:260
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:750
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:236
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1132
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1128
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition: ISDOpcodes.h:476
@ VECREDUCE_SEQ_FADD
Generic reduction nodes.
Definition: ISDOpcodes.h:1345
@ VECREDUCE_SMIN
Definition: ISDOpcodes.h:1376
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:250
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:559
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:714
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1161
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1247
@ STRICT_FCEIL
Definition: ISDOpcodes.h:426
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1037
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:783
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:483
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:199
@ RETURNADDR
Definition: ISDOpcodes.h:95
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ STRICT_FMINIMUM
Definition: ISDOpcodes.h:436
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:790
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:543
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
Definition: ISDOpcodes.h:1361
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ VECREDUCE_FMAXIMUM
FMINIMUM/FMAXIMUM nodes propagate NaNs and signed zeroes using the llvm.minimum and llvm....
Definition: ISDOpcodes.h:1365
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:688
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:255
@ VECREDUCE_SMAX
Definition: ISDOpcodes.h:1375
@ STRICT_FSETCCS
Definition: ISDOpcodes.h:477
@ STRICT_FLOG2
Definition: ISDOpcodes.h:421
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1273
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:903
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:229
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1274
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:939
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition: ISDOpcodes.h:411
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1406
@ GlobalTLSAddress
Definition: ISDOpcodes.h:79
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:885
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:774
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition: ISDOpcodes.h:662
@ STRICT_UINT_TO_FP
Definition: ISDOpcodes.h:450
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:620
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition: ISDOpcodes.h:101
@ VECREDUCE_FADD
These reductions have relaxed evaluation order semantics, and have a single vector operand.
Definition: ISDOpcodes.h:1358
@ WRITE_REGISTER
Definition: ISDOpcodes.h:119
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1227
@ VECREDUCE_FMIN
Definition: ISDOpcodes.h:1362
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:994
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition: ISDOpcodes.h:758
@ STRICT_LROUND
Definition: ISDOpcodes.h:431
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:930
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1083
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:327
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1058
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1062
@ VECTOR_INTERLEAVE
VECTOR_INTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and output vectors having the same...
Definition: ISDOpcodes.h:586
@ STEP_VECTOR
STEP_VECTOR(IMM) - Returns a scalable vector whose lanes are comprised of a linear sequence of unsign...
Definition: ISDOpcodes.h:646
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:349
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:727
@ STRICT_FPOWI
Definition: ISDOpcodes.h:413
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1243
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:211
@ VECREDUCE_UMAX
Definition: ISDOpcodes.h:1377
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition: ISDOpcodes.h:627
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1157
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:323
@ STRICT_FTRUNC
Definition: ISDOpcodes.h:430
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
Definition: ISDOpcodes.h:1370
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition: ISDOpcodes.h:880
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:651
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:705
@ ATOMIC_LOAD_CLR
Definition: ISDOpcodes.h:1272
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:600
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1271
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:573
@ STRICT_FMAXIMUM
Definition: ISDOpcodes.h:435
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition: ISDOpcodes.h:47
@ STRICT_FMAXNUM
Definition: ISDOpcodes.h:424
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition: ISDOpcodes.h:118
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:535
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:203
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:780
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1217
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:856
@ STRICT_FMINNUM
Definition: ISDOpcodes.h:425
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:742
@ VSCALE
VSCALE(IMM) - Returns the runtime scaling factor used to calculate the number of elements within a sc...
Definition: ISDOpcodes.h:1335
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1254
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:971
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
Definition: ISDOpcodes.h:1221
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:331
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1047
@ STRICT_LRINT
Definition: ISDOpcodes.h:433
@ ConstantPool
Definition: ISDOpcodes.h:82
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:798
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:674
@ VECTOR_REVERSE
VECTOR_REVERSE(VECTOR) - Returns a vector, of the same type as VECTOR, whose elements are shuffled us...
Definition: ISDOpcodes.h:591
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:888
@ STRICT_FROUND
Definition: ISDOpcodes.h:428
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:736
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:303
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition: ISDOpcodes.h:449
@ VECREDUCE_UMIN
Definition: ISDOpcodes.h:1378
@ STRICT_FFLOOR
Definition: ISDOpcodes.h:427
@ STRICT_FROUNDEVEN
Definition: ISDOpcodes.h:429
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:94
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1269
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:443
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:465
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:442
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:990
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1270
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:836
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1188
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition: ISDOpcodes.h:158
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:470
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:680
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1214
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:184
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition: ISDOpcodes.h:657
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition: ISDOpcodes.h:400
@ STRICT_FLOG10
Definition: ISDOpcodes.h:420
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:524
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ STRICT_LLRINT
Definition: ISDOpcodes.h:434
@ VECTOR_SPLICE
VECTOR_SPLICE(VEC1, VEC2, IMM) - Returns a subvector of the same type as VEC1/VEC2 from CONCAT_VECTOR...
Definition: ISDOpcodes.h:612
@ STRICT_FEXP2
Definition: ISDOpcodes.h:418
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1268
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:869
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition: ISDOpcodes.h:106
@ STRICT_LLROUND
Definition: ISDOpcodes.h:432
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition: ISDOpcodes.h:831
@ STRICT_FNEARBYINT
Definition: ISDOpcodes.h:423
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:855
@ VECREDUCE_FMINIMUM
Definition: ISDOpcodes.h:1366
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:786
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1152
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1076
@ BlockAddress
Definition: ISDOpcodes.h:84
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:763
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:493
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:340
@ AssertZext
Definition: ISDOpcodes.h:62
@ STRICT_FRINT
Definition: ISDOpcodes.h:422
@ VECTOR_DEINTERLEAVE
VECTOR_DEINTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and output vectors having the sa...
Definition: ISDOpcodes.h:580
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:313
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:515
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isExtOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1600
bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
bool isVectorShrinkable(const SDNode *N, unsigned NewEltSize, bool Signed)
Returns true if the specified node is a vector where all elements can be truncated to the specified e...
CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
MemIndexType
MemIndexType enum - This enum defines how to interpret MGATHER/SCATTER's index parameter when calcula...
Definition: ISDOpcodes.h:1491
bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:1478
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1529
bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1509
static const int LAST_INDEXED_MODE
Definition: ISDOpcodes.h:1480
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1465
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:771
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:830
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
VScaleVal_match m_VScale()
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
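For reference, a short sketch of the PatternMatch helpers above applied to an IR Value *V (V and the captured values are assumed context):
  #include "llvm/IR/PatternMatch.h"
  using namespace llvm::PatternMatch;

  llvm::Value *A = nullptr, *B = nullptr;
  if (match(V, m_c_And(m_ZExt(m_Value(A)), m_Value(B)))) {
    // V is (and (zext A), B), with the operands in either order.
  }
  if (match(V, m_OneUse(m_Shl(m_Value(A), m_ConstantInt())))) {
    // V is a single-use shl of A by some constant amount.
  }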
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
@ Define
Register definition.
@ GeneralDynamic
Definition: CodeGen.h:46
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
std::optional< Function * > getAttachedARCFunction(const CallBase *CB)
This function returns operand bundle clang_arc_attachedcall's argument, which is the address of the A...
Definition: ObjCARCUtil.h:43
bool hasAttachedCallOpBundle(const CallBase *CB)
Definition: ObjCARCUtil.h:29
DiagnosticInfoOptimizationBase::Argument NV
@ FalseVal
Definition: TGLexer.h:59
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
bool isPackedVectorType(EVT SomeVT)
Definition: VECustomDAG.cpp:22
bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isZIPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
@ Offset
Definition: DWP.cpp:456
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition: STLExtras.h:853
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void GetReturnInfo(CallingConv::ID CC, Type *ReturnType, AttributeList attr, SmallVectorImpl< ISD::OutputArg > &Outs, const TargetLowering &TLI, const DataLayout &DL)
Given an LLVM IR type and return type attributes, compute the return value EVTs and flags,...
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool isUZPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition: Utils.cpp:293
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:239
bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
bool CC_AArch64_Win64PCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition: MathExtras.h:343
bool RetCC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
std::optional< unsigned > getSVEPredPatternFromNumElements(unsigned MinNumElts)
Return specific VL predicate pattern based on the number of elements.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:280
bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition: Utils.cpp:1507
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:330
bool CC_AArch64_Arm64EC_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:269
unsigned M1(unsigned Val)
Definition: VE.h:376
bool isReleaseOrStronger(AtomicOrdering AO)
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
bool CC_AArch64_Arm64EC_Thunk_Native(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool CC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition: MathExtras.h:257
bool RetCC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
static unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
constexpr int PoisonMaskElem
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Other
Any other memory.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
CombineLevel
Definition: DAGCombine.h:15
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:244
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition: VE.h:375
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
gep_type_iterator gep_type_begin(const User *GEP)
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:2051
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition: STLExtras.h:2039
static const MachineMemOperand::Flags MOStridedAccess
bool CC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
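A minimal sketch of createSequentialMask; the arguments are illustrative:
  // Builds <Start, Start+1, ..., Start+NumInts-1> followed by NumUndefs undef lanes.
  llvm::SmallVector<int, 16> Mask =
      llvm::createSequentialMask(/*Start=*/0, /*NumInts=*/4, /*NumUndefs=*/0);
  // Mask is <0, 1, 2, 3>, suitable for IRBuilder::CreateShuffleVector.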
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
static const unsigned PerfectShuffleTable[6561+1]
@ Enable
Enable colors.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
Helper structure to keep track of a SET_CC lowered into AArch64 code.
AArch64CC::CondCode CC
Helper structure to keep track of ISD::SET_CC operands.
This is used by foldLoadsRecursive() to capture a Root Load node which is of type or(load,...
Helper structure to be able to read SetCC information.
static unsigned int semanticsPrecision(const fltSemantics &)
Definition: APFloat.cpp:292
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Represent subnormal handling kind for floating point instruction inputs and outputs.
Extended Value Type.
Definition: ValueTypes.h:34
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition: ValueTypes.h:93
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
uint64_t getScalarStoreSize() const
Definition: ValueTypes.h:387
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:274
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:290
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:340
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:448
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition: ValueTypes.h:112
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition: ValueTypes.h:349
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition: ValueTypes.h:415
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:455
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:203
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:366
EVT widenIntegerVectorElementType(LLVMContext &Context) const
Return a VT for an integer vector type with the size of the elements doubled.
Definition: ValueTypes.h:429
bool isScalableVT() const
Return true if the type is a scalable type.
Definition: ValueTypes.h:183
bool isFixedLengthVector() const
Definition: ValueTypes.h:177
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition: ValueTypes.h:58
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition: ValueTypes.h:282
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition: ValueTypes.h:208
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:246
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:202
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition: ValueTypes.h:173
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:101
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:438
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:151
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition: ValueTypes.h:198
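The EVT helpers listed above can be combined as in the following sketch; evtExample is an illustrative name and Ctx is assumed to be an existing LLVMContext.

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

void evtExample(llvm::LLVMContext &Ctx) {
  using namespace llvm;

  // A fixed-length <4 x float> vector type.
  EVT V4F32 = EVT::getVectorVT(Ctx, MVT::f32, 4);
  assert(V4F32.is128BitVector() && V4F32.isFloatingPoint());
  assert(V4F32.getVectorNumElements() == 4);
  assert(V4F32.getScalarSizeInBits() == 32);

  // Same shape, but with integer elements of the same width: <4 x i32>.
  EVT V4I32 = V4F32.changeVectorElementTypeToInteger();
  assert(V4I32.isInteger() && V4I32.bitsEq(V4F32));

  // Half the number of elements: <2 x i32>, a 64-bit vector.
  EVT V2I32 = V4I32.getHalfNumVectorElementsVT(Ctx);
  assert(V2I32.is64BitVector());

  // A scalable SVE-style vector: <vscale x 4 x i32>.
  EVT NxV4I32 = EVT::getVectorVT(Ctx, MVT::i32, 4, /*IsScalable=*/true);
  assert(NxV4I32.isScalableVector() && NxV4I32.getVectorMinNumElements() == 4);
}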
Describes a register that needs to be forwarded from the prologue to a musttail call.
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:297
static KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
Definition: KnownBits.cpp:434
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:157
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:40
static KnownBits lshr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for lshr(LHS, RHS).
Definition: KnownBits.cpp:376
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:292
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition: KnownBits.h:307
static KnownBits shl(const KnownBits &LHS, const KnownBits &RHS, bool NUW=false, bool NSW=false, bool ShAmtNonZero=false)
Compute known bits for shl(LHS, RHS).
Definition: KnownBits.cpp:291
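A sketch of the KnownBits operations above applied to fully known values; knownBitsExample is an illustrative name.

#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
#include <cassert>

void knownBitsExample() {
  using namespace llvm;

  // All 8 bits known: the constant 0b00000101.
  KnownBits Five = KnownBits::makeConstant(APInt(8, 5));
  assert(Five.getBitWidth() == 8);
  assert(Five.countMaxActiveBits() == 3); // all possible values fit in 3 bits

  // Shifting a fully known value by a fully known amount stays fully known.
  KnownBits One = KnownBits::makeConstant(APInt(8, 1));
  KnownBits Ten = KnownBits::shl(Five, One);
  assert(Ten.isConstant() && Ten.getConstant() == 10);

  // lshr (and ashr) work the same way on the tracked bits.
  KnownBits Two = KnownBits::lshr(Five, One);
  assert(Two.isConstant() && Two.getConstant() == 2);

  // Truncation and intersection of two facts about the same value.
  KnownBits Low4 = Five.trunc(4);
  assert(Low4.getBitWidth() == 4);
  KnownBits Meet = Five.intersectWith(Ten); // keeps only bits both agree on
  (void)Meet;
}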
Structure used to represent pair of argument number after call lowering and register used to transfer...
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands,...
unsigned getAddrSpace() const
Return the LLVM IR address space number that this pointer points into.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
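A hedged sketch of the MachinePointerInfo factories above; pointerInfoExample is an illustrative name, and MF/FI are assumed to come from surrounding call- or frame-lowering code.

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"

void pointerInfoExample(llvm::MachineFunction &MF, int FI) {
  using namespace llvm;

  // Memory known to be a specific fixed stack slot ...
  MachinePointerInfo Slot = MachinePointerInfo::getFixedStack(MF, FI);
  // ... and the second 8-byte word inside that slot.
  MachinePointerInfo SlotPlus8 = Slot.getWithOffset(8);

  // Stack memory for which nothing better is known.
  MachinePointerInfo Unknown = MachinePointerInfo::getUnknownStack(MF);

  (void)SlotPlus8;
  (void)Unknown.getAddrSpace(); // LLVM IR address space of the pointer
}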
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Constraint for a predicate of the form "cmp Pred Op, OtherOp", where Op is the value the constraint a...
Definition: PredicateInfo.h:74
These are IR-level optimization flags that may be propagated to SDNodes.
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::OutputArg, 32 > Outs
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
bool CombineTo(SDValue O, SDValue N)
Helper structure to keep track of SetCC information.
GenericSetCCInfo Generic
AArch64SetCCInfo AArch64