LLVM 19.0.0git
AArch64ISelLowering.cpp
1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
22#include "llvm/ADT/APFloat.h"
23#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/ArrayRef.h"
25#include "llvm/ADT/STLExtras.h"
26#include "llvm/ADT/SmallSet.h"
28#include "llvm/ADT/Statistic.h"
29#include "llvm/ADT/StringRef.h"
30#include "llvm/ADT/Twine.h"
59#include "llvm/IR/Attributes.h"
60#include "llvm/IR/Constants.h"
61#include "llvm/IR/DataLayout.h"
62#include "llvm/IR/DebugLoc.h"
64#include "llvm/IR/Function.h"
66#include "llvm/IR/GlobalValue.h"
67#include "llvm/IR/IRBuilder.h"
68#include "llvm/IR/Instruction.h"
71#include "llvm/IR/Intrinsics.h"
72#include "llvm/IR/IntrinsicsAArch64.h"
73#include "llvm/IR/Module.h"
75#include "llvm/IR/Type.h"
76#include "llvm/IR/Use.h"
77#include "llvm/IR/Value.h"
83#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <bitset>
94#include <cassert>
95#include <cctype>
96#include <cstdint>
97#include <cstdlib>
98#include <iterator>
99#include <limits>
100#include <optional>
101#include <tuple>
102#include <utility>
103#include <vector>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107
108#define DEBUG_TYPE "aarch64-lower"
109
110STATISTIC(NumTailCalls, "Number of tail calls");
111STATISTIC(NumShiftInserts, "Number of vector shift inserts");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130// to DAGCombiner.cpp in D92230. It is expected that this can be removed in the
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
142// All of the XOR, OR and CMP operations use ALU ports, and the data dependency
143// becomes the bottleneck after this transform on high-end CPUs. This maximum
144// leaf-node limit ensures that forming cmp+ccmp stays profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148/// Value type used for condition codes.
149static const MVT MVT_CC = MVT::i32;
150
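// The registers used by the standard AAPCS64 calling convention for the first
// eight integer arguments and the first eight FP/SIMD arguments respectively.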
151static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
152 AArch64::X3, AArch64::X4, AArch64::X5,
153 AArch64::X6, AArch64::X7};
154static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
155 AArch64::Q3, AArch64::Q4, AArch64::Q5,
156 AArch64::Q6, AArch64::Q7};
157
159
161
162static inline EVT getPackedSVEVectorVT(EVT VT) {
163 switch (VT.getSimpleVT().SimpleTy) {
164 default:
165 llvm_unreachable("unexpected element type for vector");
166 case MVT::i8:
167 return MVT::nxv16i8;
168 case MVT::i16:
169 return MVT::nxv8i16;
170 case MVT::i32:
171 return MVT::nxv4i32;
172 case MVT::i64:
173 return MVT::nxv2i64;
174 case MVT::f16:
175 return MVT::nxv8f16;
176 case MVT::f32:
177 return MVT::nxv4f32;
178 case MVT::f64:
179 return MVT::nxv2f64;
180 case MVT::bf16:
181 return MVT::nxv8bf16;
182 }
183}
184
185// NOTE: Currently there's only a need to return integer vector types. If this
186// changes then just add an extra "type" parameter.
188 switch (EC.getKnownMinValue()) {
189 default:
190 llvm_unreachable("unexpected element count for vector");
191 case 16:
192 return MVT::nxv16i8;
193 case 8:
194 return MVT::nxv8i16;
195 case 4:
196 return MVT::nxv4i32;
197 case 2:
198 return MVT::nxv2i64;
199 }
200}
201
203 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
204 "Expected scalable predicate vector type!");
205 switch (VT.getVectorMinNumElements()) {
206 default:
207 llvm_unreachable("unexpected element count for vector");
208 case 2:
209 return MVT::nxv2i64;
210 case 4:
211 return MVT::nxv4i32;
212 case 8:
213 return MVT::nxv8i16;
214 case 16:
215 return MVT::nxv16i8;
216 }
217}
218
219/// Returns true if VT's elements occupy the lowest bit positions of its
220/// associated register class without any intervening space.
221///
222/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
223/// same register class, but only nxv8f16 can be treated as a packed vector.
224static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
226 "Expected legal vector type!");
227 return VT.isFixedLengthVector() ||
229}
230
231// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
232// predicate and end with a passthru value matching the result type.
233static bool isMergePassthruOpcode(unsigned Opc) {
234 switch (Opc) {
235 default:
236 return false;
266 return true;
267 }
268}
269
270// Returns true if inactive lanes are known to be zeroed by construction.
272 switch (Op.getOpcode()) {
273 default:
274 return false;
275 // We guarantee i1 splat_vectors to zero the other lanes
279 return true;
281 switch (Op.getConstantOperandVal(0)) {
282 default:
283 return false;
284 case Intrinsic::aarch64_sve_ptrue:
285 case Intrinsic::aarch64_sve_pnext:
286 case Intrinsic::aarch64_sve_cmpeq:
287 case Intrinsic::aarch64_sve_cmpne:
288 case Intrinsic::aarch64_sve_cmpge:
289 case Intrinsic::aarch64_sve_cmpgt:
290 case Intrinsic::aarch64_sve_cmphs:
291 case Intrinsic::aarch64_sve_cmphi:
292 case Intrinsic::aarch64_sve_cmpeq_wide:
293 case Intrinsic::aarch64_sve_cmpne_wide:
294 case Intrinsic::aarch64_sve_cmpge_wide:
295 case Intrinsic::aarch64_sve_cmpgt_wide:
296 case Intrinsic::aarch64_sve_cmplt_wide:
297 case Intrinsic::aarch64_sve_cmple_wide:
298 case Intrinsic::aarch64_sve_cmphs_wide:
299 case Intrinsic::aarch64_sve_cmphi_wide:
300 case Intrinsic::aarch64_sve_cmplo_wide:
301 case Intrinsic::aarch64_sve_cmpls_wide:
302 case Intrinsic::aarch64_sve_fcmpeq:
303 case Intrinsic::aarch64_sve_fcmpne:
304 case Intrinsic::aarch64_sve_fcmpge:
305 case Intrinsic::aarch64_sve_fcmpgt:
306 case Intrinsic::aarch64_sve_fcmpuo:
307 case Intrinsic::aarch64_sve_facgt:
308 case Intrinsic::aarch64_sve_facge:
309 case Intrinsic::aarch64_sve_whilege:
310 case Intrinsic::aarch64_sve_whilegt:
311 case Intrinsic::aarch64_sve_whilehi:
312 case Intrinsic::aarch64_sve_whilehs:
313 case Intrinsic::aarch64_sve_whilele:
314 case Intrinsic::aarch64_sve_whilelo:
315 case Intrinsic::aarch64_sve_whilels:
316 case Intrinsic::aarch64_sve_whilelt:
317 case Intrinsic::aarch64_sve_match:
318 case Intrinsic::aarch64_sve_nmatch:
319 case Intrinsic::aarch64_sve_whilege_x2:
320 case Intrinsic::aarch64_sve_whilegt_x2:
321 case Intrinsic::aarch64_sve_whilehi_x2:
322 case Intrinsic::aarch64_sve_whilehs_x2:
323 case Intrinsic::aarch64_sve_whilele_x2:
324 case Intrinsic::aarch64_sve_whilelo_x2:
325 case Intrinsic::aarch64_sve_whilels_x2:
326 case Intrinsic::aarch64_sve_whilelt_x2:
327 return true;
328 }
329 }
330}
331
333 const AArch64Subtarget &STI)
334 : TargetLowering(TM), Subtarget(&STI) {
335 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
336 // we have to make something up. Arbitrarily, choose ZeroOrOne.
338 // When comparing vectors the result sets the different elements in the
339 // vector to all-one or all-zero.
341
342 // Set up the register classes.
343 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
344 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
345
346 if (Subtarget->hasLS64()) {
347 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
348 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
350 }
351
352 if (Subtarget->hasFPARMv8()) {
353 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
354 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
355 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
356 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
357 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
358 }
359
360 if (Subtarget->hasNEON()) {
361 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
362 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
363 // Someone set us up the NEON.
364 addDRTypeForNEON(MVT::v2f32);
365 addDRTypeForNEON(MVT::v8i8);
366 addDRTypeForNEON(MVT::v4i16);
367 addDRTypeForNEON(MVT::v2i32);
368 addDRTypeForNEON(MVT::v1i64);
369 addDRTypeForNEON(MVT::v1f64);
370 addDRTypeForNEON(MVT::v4f16);
371 addDRTypeForNEON(MVT::v4bf16);
372
373 addQRTypeForNEON(MVT::v4f32);
374 addQRTypeForNEON(MVT::v2f64);
375 addQRTypeForNEON(MVT::v16i8);
376 addQRTypeForNEON(MVT::v8i16);
377 addQRTypeForNEON(MVT::v4i32);
378 addQRTypeForNEON(MVT::v2i64);
379 addQRTypeForNEON(MVT::v8f16);
380 addQRTypeForNEON(MVT::v8bf16);
381 }
382
383 if (Subtarget->hasSVEorSME()) {
384 // Add legal sve predicate types
385 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
386 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
387 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
388 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
389 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
390
391 // Add legal sve data types
392 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
393 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
394 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
395 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
396
397 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
398 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
399 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
400 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
401 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
402 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
403
404 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
405 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
406 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
407
408 if (Subtarget->useSVEForFixedLengthVectors()) {
411 addRegisterClass(VT, &AArch64::ZPRRegClass);
412
415 addRegisterClass(VT, &AArch64::ZPRRegClass);
416 }
417 }
418
419 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
420 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
421 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
422 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
423
424 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
425 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
426 }
427
428 // Compute derived properties from the register classes
430
431 // Provide all sorts of operation actions
470
474
478
480
481 // Custom lowering hooks are needed for XOR
482 // to fold it into CSINC/CSINV.
485
486 // Virtually no operation on f128 is legal, but LLVM can't expand them when
487 // there's a valid register class, so we need custom operations in most cases.
511 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
512 // aren't handled.
513
514 // Lowering for many of the conversions is actually specified by the non-f128
515 // type. The LowerXXX function will be trivial when f128 isn't involved.
540 if (Subtarget->hasFPARMv8()) {
543 }
546 if (Subtarget->hasFPARMv8()) {
549 }
552
557
558 // Variable arguments.
563
564 // Variable-sized objects.
567
568 // Lowering Funnel Shifts to EXTR
573
575
576 // Constant pool entries
578
579 // BlockAddress
581
582 // AArch64 lacks both left-rotate and popcount instructions.
588 }
589
590 // AArch64 doesn't have i32 MULH{S|U}.
593
594 // AArch64 doesn't have {U|S}MUL_LOHI.
599
600 if (Subtarget->hasCSSC()) {
604
606
610
613
618
623 } else {
627
630
633 }
634
640 }
647
648 // Custom lower Add/Sub/Mul with overflow.
661
670
679 if (Subtarget->hasFullFP16()) {
682 } else {
685 }
686
687 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
695 setOperationAction(Op, MVT::f16, Promote);
696 setOperationAction(Op, MVT::v4f16, Expand);
697 setOperationAction(Op, MVT::v8f16, Expand);
698 setOperationAction(Op, MVT::bf16, Promote);
699 setOperationAction(Op, MVT::v4bf16, Expand);
700 setOperationAction(Op, MVT::v8bf16, Expand);
701 }
702
703 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
704 for (auto Op : {
708 ISD::FADD,
709 ISD::FSUB,
710 ISD::FMUL,
711 ISD::FDIV,
712 ISD::FMA,
742 })
743 setOperationAction(Op, ScalarVT, Promote);
744
745 for (auto Op : {ISD::FNEG, ISD::FABS})
746 setOperationAction(Op, ScalarVT, Legal);
747
748 // Round-to-integer operations need custom lowering for fp16, as Promote
749 // doesn't work because the result type is integer.
753 setOperationAction(Op, ScalarVT, Custom);
754
755 // promote v4f16 to v4f32 when that is known to be safe.
756 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
757 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
758 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
759 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
760 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
761 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
762 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
763 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
764 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
765 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
766 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
767 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
768
778
779 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
801 };
802
803 if (!Subtarget->hasFullFP16()) {
804 LegalizeNarrowFP(MVT::f16);
805 }
806 LegalizeNarrowFP(MVT::bf16);
809
810 // AArch64 has implementations of a lot of rounding-like FP operations.
811 for (auto Op :
822 for (MVT Ty : {MVT::f32, MVT::f64})
824 if (Subtarget->hasFullFP16())
825 setOperationAction(Op, MVT::f16, Legal);
826 }
827
828 // Basic strict FP operations are legal
831 for (MVT Ty : {MVT::f32, MVT::f64})
833 if (Subtarget->hasFullFP16())
834 setOperationAction(Op, MVT::f16, Legal);
835 }
836
837 // Strict conversion to a larger type is legal
838 for (auto VT : {MVT::f32, MVT::f64})
840
842
845
847 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
850 } else {
853 }
856
857 // Generate outline atomics library calls only if LSE was not specified for
858 // subtarget
859 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
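  // The macros below register the names of the __aarch64_* outline-atomic
  // helpers: one variant per memory ordering (relaxed, acquire, release,
  // acq_rel) and per access size (1, 2, 4 and 8 bytes, plus 16 bytes for
  // compare-and-swap).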
885#define LCALLNAMES(A, B, N) \
886 setLibcallName(A##N##_RELAX, #B #N "_relax"); \
887 setLibcallName(A##N##_ACQ, #B #N "_acq"); \
888 setLibcallName(A##N##_REL, #B #N "_rel"); \
889 setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
890#define LCALLNAME4(A, B) \
891 LCALLNAMES(A, B, 1) \
892 LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
893#define LCALLNAME5(A, B) \
894 LCALLNAMES(A, B, 1) \
895 LCALLNAMES(A, B, 2) \
896 LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
897 LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
898 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
899 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
900 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
901 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
902 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
903#undef LCALLNAMES
904#undef LCALLNAME4
905#undef LCALLNAME5
906 }
907
908 if (Subtarget->hasLSE128()) {
909 // Custom lowering because i128 is not legal. Must be replaced by 2x64
910 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
914 }
915
916 // 128-bit loads and stores can be done without expanding
919
920 // Aligned 128-bit loads and stores are single-copy atomic according to the
921 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
922 if (Subtarget->hasLSE2()) {
925 }
926
927 // 256-bit non-temporal stores can be lowered to STNP. Do this as part of the
928 // custom lowering, as there are no un-paired non-temporal stores and
929 // legalization will break up 256-bit inputs.
931 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
932 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
933 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
938
939 // 256-bit non-temporal loads can be lowered to LDNP. This is done using
940 // custom lowering, as there are no un-paired non-temporal loads and
941 // legalization will break up 256-bit inputs.
942 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
943 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
944 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
945 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
946 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
947 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
948 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
949 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
950
951 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
953
954 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
955 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
956 // Issue __sincos_stret if available.
959 } else {
962 }
963
964 if (Subtarget->getTargetTriple().isOSMSVCRT()) {
965 // MSVCRT doesn't have powi; fall back to pow
966 setLibcallName(RTLIB::POWI_F32, nullptr);
967 setLibcallName(RTLIB::POWI_F64, nullptr);
968 }
969
970 // Make floating-point constants legal for the large code model, so they don't
971 // become loads from the constant pool.
972 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
975 }
976
977 // AArch64 does not have floating-point extending loads, i1 sign-extending
978 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
979 for (MVT VT : MVT::fp_valuetypes()) {
980 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
981 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
982 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
983 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
984 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
985 }
986 for (MVT VT : MVT::integer_valuetypes())
988
989 for (MVT WideVT : MVT::fp_valuetypes()) {
990 for (MVT NarrowVT : MVT::fp_valuetypes()) {
991 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
992 setTruncStoreAction(WideVT, NarrowVT, Expand);
993 }
994 }
995 }
996
997 if (Subtarget->hasFPARMv8()) {
1001 }
1002
1003 // Indexed loads and stores are supported.
1004 for (unsigned im = (unsigned)ISD::PRE_INC;
1006 setIndexedLoadAction(im, MVT::i8, Legal);
1007 setIndexedLoadAction(im, MVT::i16, Legal);
1008 setIndexedLoadAction(im, MVT::i32, Legal);
1009 setIndexedLoadAction(im, MVT::i64, Legal);
1010 setIndexedLoadAction(im, MVT::f64, Legal);
1011 setIndexedLoadAction(im, MVT::f32, Legal);
1012 setIndexedLoadAction(im, MVT::f16, Legal);
1013 setIndexedLoadAction(im, MVT::bf16, Legal);
1014 setIndexedStoreAction(im, MVT::i8, Legal);
1015 setIndexedStoreAction(im, MVT::i16, Legal);
1016 setIndexedStoreAction(im, MVT::i32, Legal);
1017 setIndexedStoreAction(im, MVT::i64, Legal);
1018 setIndexedStoreAction(im, MVT::f64, Legal);
1019 setIndexedStoreAction(im, MVT::f32, Legal);
1020 setIndexedStoreAction(im, MVT::f16, Legal);
1021 setIndexedStoreAction(im, MVT::bf16, Legal);
1022 }
1023
1024 // Trap.
1025 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1028
1029 // We combine OR nodes for bitfield operations.
1031 // Try to create BICs for vector ANDs.
1033
1034 // Vector add and sub nodes may conceal a high-half opportunity.
1035 // Also, try to fold ADD into CSINC/CSINV..
1038
1041
1042 // Try and combine setcc with csel
1044
1046
1053
1055
1057
1059
1063
1065
1067
1069
1071
1075
1077
1078 // In case of strict alignment, avoid an excessive number of byte wide stores.
1081 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1082
1086 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1087
1090
1093 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1094
1096
1098
1099 EnableExtLdPromotion = true;
1100
1101 // Set required alignment.
1103 // Set preferred alignments.
1104
1105 // Don't align loops on Windows. The SEH unwind info generation needs to
1106 // know the exact length of functions before the alignments have been
1107 // expanded.
1108 if (!Subtarget->isTargetWindows())
1112
1113 // Only change the limit for entries in a jump table if specified by
1114 // the subtarget, but not at the command line.
1115 unsigned MaxJT = STI.getMaximumJumpTableSize();
1116 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1118
1120
1122
1124
1125 if (Subtarget->hasNEON()) {
1126 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1127 // silliness like this:
1128 for (auto Op :
1146 setOperationAction(Op, MVT::v1f64, Expand);
1147
1148 for (auto Op :
1153 setOperationAction(Op, MVT::v1i64, Expand);
1154
1155 // AArch64 doesn't have direct vector->f32 conversion instructions for
1156 // elements smaller than i32, so promote the input to i32 first.
1157 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1158 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1159
1160 // Similarly, there is no direct i32 -> f64 vector conversion instruction.
1161 // Or, direct i32 -> f16 vector conversion. Set it to Custom, so the
1162 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1165 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1167
1168 if (Subtarget->hasFullFP16()) {
1171
1180 } else {
1181 // when AArch64 doesn't have fullfp16 support, promote the input
1182 // to i32 first.
1183 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1184 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1185 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1186 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1187 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1188 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1189 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1190 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1191 }
1192
1193 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1194 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1201 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1206 }
1207
1208 // Custom handling for some quad-vector types to detect MULL.
1209 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1210 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1211 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1212 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1213 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1214 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1215
1216 // Saturates
1217 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1218 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1223 }
1224
1225 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1226 MVT::v4i32}) {
1233 }
1234
1235 // Vector reductions
1236 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1237 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1238 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1243
1245 }
1246 }
1247 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1248 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1257 }
1262
1264 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1265 // Likewise, narrowing and extending vector loads/stores aren't handled
1266 // directly.
1269
1270 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1273 } else {
1276 }
1279
1282
1283 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1284 setTruncStoreAction(VT, InnerVT, Expand);
1285 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1286 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1287 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1288 }
1289 }
1290
1291 // AArch64 has implementations of a lot of rounding-like FP operations.
1292 for (auto Op :
1297 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1299 if (Subtarget->hasFullFP16())
1300 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1302 }
1303
1304 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1305
1310
1314
1315 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1316 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1317 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1318 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1319 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1320 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1321
1322 // ADDP custom lowering
1323 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1325 // FADDP custom lowering
1326 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1328 }
1329
1330 if (Subtarget->hasSME()) {
1332 }
1333
1334 // FIXME: Move lowering for more nodes here if those are common between
1335 // SVE and SME.
1336 if (Subtarget->hasSVEorSME()) {
1337 for (auto VT :
1338 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1343 }
1344 }
1345
1346 if (Subtarget->hasSVEorSME()) {
1347 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1390
1396
1405
1410
1411 if (!Subtarget->isLittleEndian())
1413
1414 if (Subtarget->hasSVE2orSME())
1415 // For SLI/SRI.
1417 }
1418
1419 // Illegal unpacked integer vector types.
1420 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1423 }
1424
1425 // Legalize unpacked bitcasts to REINTERPRET_CAST.
1426 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
1427 MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
1429
1430 for (auto VT :
1431 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1432 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1434
1435 for (auto VT :
1436 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1444
1448
1449 // There are no legal MVT::nxv16f## based types.
1450 if (VT != MVT::nxv16i1) {
1453 }
1454 }
1455
1456 // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
1457 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1458 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1459 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1464 }
1465
1466 // First, exclude all scalable-vector extending loads and truncating stores,
1467 // covering both integer and floating-point scalable vectors.
1469 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1470 setTruncStoreAction(VT, InnerVT, Expand);
1471 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1472 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1473 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1474 }
1475 }
1476
1477 // Then, selectively enable those which we directly support.
1478 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1479 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1480 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1481 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1482 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1483 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1484 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1485 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1486 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1487 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1488 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1489 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1490 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1491 }
1492
1493 // SVE supports truncating stores of 64 and 128-bit vectors
1494 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1495 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1496 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1497 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1498 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1499
1500 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1501 MVT::nxv4f32, MVT::nxv2f64}) {
1537 if (Subtarget->isSVEAvailable())
1542
1556
1568
1569 if (!Subtarget->isLittleEndian())
1571 }
1572
1573 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1580
1581 if (!Subtarget->isLittleEndian())
1583 }
1584
1587
1588 // NEON doesn't support integer divides, but SVE does
1589 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1590 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1593 }
1594
1595 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1596 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1597 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1598
1599 if (Subtarget->isSVEAvailable()) {
1600 // NEON doesn't support across-vector reductions, but SVE does.
1601 for (auto VT :
1602 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
1604 }
1605
1606 if (!Subtarget->isNeonAvailable()) {
1607 setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Custom);
1608 setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Custom);
1609 setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Custom);
1610 setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Custom);
1611 setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Custom);
1612 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Custom);
1613 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Custom);
1614 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Custom);
1615 setTruncStoreAction(MVT::v1f64, MVT::v1f16, Custom);
1616 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Custom);
1617 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Custom);
1618 setTruncStoreAction(MVT::v1f64, MVT::v1f32, Custom);
1619 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Custom);
1620 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom);
1621 for (MVT VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1622 MVT::v4i32, MVT::v1i64, MVT::v2i64})
1623 addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ true);
1624
1625 for (MVT VT :
1626 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
1627 addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ true);
1628 }
1629
1630 // NOTE: Currently this has to happen after computeRegisterProperties rather
1631 // than the preferred option of combining it with the addRegisterClass call.
1632 if (Subtarget->useSVEForFixedLengthVectors()) {
1635 addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ false);
1638 addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ false);
1639
1640 // 64-bit results can mean a bigger-than-NEON input.
1641 for (auto VT : {MVT::v8i8, MVT::v4i16})
1644
1645 // 128-bit results imply a bigger-than-NEON input.
1646 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1648 for (auto VT : {MVT::v8f16, MVT::v4f32})
1650
1651 // These operations are not supported on NEON but SVE can do them.
1653 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1654 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1655 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1656 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1657 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1658 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1659 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1660 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1661 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1662 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1663 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1664 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1665 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1666 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1667 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1672
1673 // Int operations with no NEON support.
1674 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1675 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1683 }
1684
1685
1686 // Use SVE for vectors with more than 2 elements.
1687 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1689 }
1690
1691 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1692 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1693 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1694 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1695
1697 }
1698
1699 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1700 // Only required for llvm.aarch64.mops.memset.tag
1702 }
1703
1705
1706 if (Subtarget->hasSVE()) {
1711 }
1712
1713 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1714
1715 IsStrictFPEnabled = true;
1717
1718 if (Subtarget->isWindowsArm64EC()) {
1719 // FIXME: are there other intrinsics we need to add here?
1720 setLibcallName(RTLIB::MEMCPY, "#memcpy");
1721 setLibcallName(RTLIB::MEMSET, "#memset");
1722 setLibcallName(RTLIB::MEMMOVE, "#memmove");
1723 setLibcallName(RTLIB::REM_F32, "#fmodf");
1724 setLibcallName(RTLIB::REM_F64, "#fmod");
1725 setLibcallName(RTLIB::FMA_F32, "#fmaf");
1726 setLibcallName(RTLIB::FMA_F64, "#fma");
1727 setLibcallName(RTLIB::SQRT_F32, "#sqrtf");
1728 setLibcallName(RTLIB::SQRT_F64, "#sqrt");
1729 setLibcallName(RTLIB::CBRT_F32, "#cbrtf");
1730 setLibcallName(RTLIB::CBRT_F64, "#cbrt");
1731 setLibcallName(RTLIB::LOG_F32, "#logf");
1732 setLibcallName(RTLIB::LOG_F64, "#log");
1733 setLibcallName(RTLIB::LOG2_F32, "#log2f");
1734 setLibcallName(RTLIB::LOG2_F64, "#log2");
1735 setLibcallName(RTLIB::LOG10_F32, "#log10f");
1736 setLibcallName(RTLIB::LOG10_F64, "#log10");
1737 setLibcallName(RTLIB::EXP_F32, "#expf");
1738 setLibcallName(RTLIB::EXP_F64, "#exp");
1739 setLibcallName(RTLIB::EXP2_F32, "#exp2f");
1740 setLibcallName(RTLIB::EXP2_F64, "#exp2");
1741 setLibcallName(RTLIB::EXP10_F32, "#exp10f");
1742 setLibcallName(RTLIB::EXP10_F64, "#exp10");
1743 setLibcallName(RTLIB::SIN_F32, "#sinf");
1744 setLibcallName(RTLIB::SIN_F64, "#sin");
1745 setLibcallName(RTLIB::COS_F32, "#cosf");
1746 setLibcallName(RTLIB::COS_F64, "#cos");
1747 setLibcallName(RTLIB::POW_F32, "#powf");
1748 setLibcallName(RTLIB::POW_F64, "#pow");
1749 setLibcallName(RTLIB::LDEXP_F32, "#ldexpf");
1750 setLibcallName(RTLIB::LDEXP_F64, "#ldexp");
1751 setLibcallName(RTLIB::FREXP_F32, "#frexpf");
1752 setLibcallName(RTLIB::FREXP_F64, "#frexp");
1753 }
1754}
1755
1756void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1757 assert(VT.isVector() && "VT should be a vector type");
1758
1759 if (VT.isFloatingPoint()) {
1761 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1762 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1763 }
1764
1765 // Mark vector float intrinsics as expand.
1766 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1776 }
1777
1778 // But we do support custom-lowering for FCOPYSIGN.
1779 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1780 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
1781 VT == MVT::v8f16) &&
1782 Subtarget->hasFullFP16()))
1784
1797
1801 for (MVT InnerVT : MVT::all_valuetypes())
1802 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1803
1804 // CNT supports only B element sizes, so use UADDLP to widen.
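  // For example, a v4i16 CTPOP is performed as a byte-wise CNT on the v8i8
  // view of the register, followed by UADDLP to pairwise-add the byte counts
  // into halfword lanes.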
1805 if (VT != MVT::v8i8 && VT != MVT::v16i8)
1807
1813
1814 for (unsigned Opcode :
1817 setOperationAction(Opcode, VT, Custom);
1818
1819 if (!VT.isFloatingPoint())
1821
1822 // [SU][MIN|MAX] are available for all NEON types apart from i64.
1823 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1824 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1825 setOperationAction(Opcode, VT, Legal);
1826
1827 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1828 // NEON types.
1829 if (VT.isFloatingPoint() &&
1830 VT.getVectorElementType() != MVT::bf16 &&
1831 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1832 for (unsigned Opcode :
1838 setOperationAction(Opcode, VT, Legal);
1839
1840 // Strict fp extend and trunc are legal
1841 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
1843 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
1845
1846 // FIXME: We could potentially make use of the vector comparison instructions
1847 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
1848 // complications:
1849 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1850 // so we would need to expand when the condition code doesn't match the
1851 // kind of comparison.
1852 // * Some kinds of comparison require more than one FCMXY instruction so
1853 // would need to be expanded instead.
1854 // * The lowering of the non-strict versions involves target-specific ISD
1855 // nodes so we would likely need to add strict versions of all of them and
1856 // handle them appropriately.
1859
1860 if (Subtarget->isLittleEndian()) {
1861 for (unsigned im = (unsigned)ISD::PRE_INC;
1865 }
1866 }
1867
1868 if (Subtarget->hasD128()) {
1871 }
1872}
1873
1875 EVT OpVT) const {
1876 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
1877 if (!Subtarget->hasSVE())
1878 return true;
1879
1880 // We can only support legal predicate result types. We can use the SVE
1881 // whilelo instruction for generating fixed-width predicates too.
1882 if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
1883 ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
1884 ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
1885 return true;
1886
1887 // The whilelo instruction only works with i32 or i64 scalar inputs.
1888 if (OpVT != MVT::i32 && OpVT != MVT::i64)
1889 return true;
1890
1891 return false;
1892}
1893
1895 return !Subtarget->hasSVEorSME() || VT != MVT::nxv16i1;
1896}
1897
1898void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT,
1899 bool StreamingSVE) {
1900 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1901
1902 // By default everything must be expanded.
1903 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1905
1906 if (VT.isFloatingPoint()) {
1916 }
1917
1918 // Mark integer truncating stores/extending loads as having custom lowering
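  // For example, for VT == v4i32 this marks v4i32->v4i8 and v4i32->v4i16
  // truncating stores and the corresponding extending loads as Custom.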
1919 if (VT.isInteger()) {
1920 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
1921 while (InnerVT != VT) {
1922 setTruncStoreAction(VT, InnerVT, Custom);
1923 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom);
1924 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom);
1925 InnerVT = InnerVT.changeVectorElementType(
1926 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
1927 }
1928 }
1929
1930 // Mark floating-point truncating stores/extending loads as having custom
1931 // lowering
1932 if (VT.isFloatingPoint()) {
1933 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
1934 while (InnerVT != VT) {
1935 setTruncStoreAction(VT, InnerVT, Custom);
1936 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom);
1937 InnerVT = InnerVT.changeVectorElementType(
1939 }
1940 }
1941
1942 // Lower fixed length vector operations to scalable equivalents.
1947 setOperationAction(ISD::BITCAST, VT, StreamingSVE ? Legal : Custom);
1982 setOperationAction(ISD::LOAD, VT, StreamingSVE ? Legal : Custom);
1983 setOperationAction(ISD::MGATHER, VT, StreamingSVE ? Expand : Custom);
1985 setOperationAction(ISD::MSCATTER, VT, StreamingSVE ? Expand : Custom);
2004 setOperationAction(ISD::STORE, VT, StreamingSVE ? Legal : Custom);
2020 StreamingSVE ? Expand : Custom);
2031}
2032
2033void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
2034 addRegisterClass(VT, &AArch64::FPR64RegClass);
2035 addTypeForNEON(VT);
2036}
2037
2038void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
2039 addRegisterClass(VT, &AArch64::FPR128RegClass);
2040 addTypeForNEON(VT);
2041}
2042
2044 LLVMContext &C, EVT VT) const {
2045 if (!VT.isVector())
2046 return MVT::i32;
2047 if (VT.isScalableVector())
2048 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2050}
2051
2052// isIntImmediate - This method tests to see if the node is a constant
2053// operand. If so Imm will receive the value.
2054static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2055 if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
2056 Imm = C->getZExtValue();
2057 return true;
2058 }
2059 return false;
2060}
2061
2062// isOpcWithIntImmediate - This method tests to see if the node is a specific
2063 // opcode and that it has an immediate integer right operand.
2064// If so Imm will receive the value.
2065static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2066 uint64_t &Imm) {
2067 return N->getOpcode() == Opc &&
2068 isIntImmediate(N->getOperand(1).getNode(), Imm);
2069}
2070
2071static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2072 const APInt &Demanded,
2074 unsigned NewOpc) {
2075 uint64_t OldImm = Imm, NewImm, Enc;
2076 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2077
2078 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2079 // bimm64.
2080 if (Imm == 0 || Imm == Mask ||
2082 return false;
2083
2084 unsigned EltSize = Size;
2085 uint64_t DemandedBits = Demanded.getZExtValue();
2086
2087 // Clear bits that are not demanded.
2088 Imm &= DemandedBits;
2089
2090 while (true) {
2091 // The goal here is to set the non-demanded bits in a way that minimizes
2092 // the number of switching between 0 and 1. In order to achieve this goal,
2093 // we set the non-demanded bits to the value of the preceding demanded bits.
2094 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2095 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2096 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2097 // The final result is 0b11000011.
2098 uint64_t NonDemandedBits = ~DemandedBits;
2099 uint64_t InvertedImm = ~Imm & DemandedBits;
2100 uint64_t RotatedImm =
2101 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2102 NonDemandedBits;
2103 uint64_t Sum = RotatedImm + NonDemandedBits;
2104 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2105 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2106 NewImm = (Imm | Ones) & Mask;
2107
2108 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2109 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2110 // we halve the element size and continue the search.
2111 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2112 break;
2113
2114 // We cannot shrink the element size any further if it is 2-bits.
2115 if (EltSize == 2)
2116 return false;
2117
2118 EltSize /= 2;
2119 Mask >>= EltSize;
2120 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2121
2122 // Return if there is mismatch in any of the demanded bits of Imm and Hi.
2123 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2124 return false;
2125
2126 // Merge the upper and lower halves of Imm and DemandedBits.
2127 Imm |= Hi;
2128 DemandedBits |= DemandedBitsHi;
2129 }
2130
2131 ++NumOptimizedImms;
2132
2133 // Replicate the element across the register width.
2134 while (EltSize < Size) {
2135 NewImm |= NewImm << EltSize;
2136 EltSize *= 2;
2137 }
2138
2139 (void)OldImm;
2140 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2141 "demanded bits should never be altered");
2142 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2143
2144 // Create the new constant immediate node.
2145 EVT VT = Op.getValueType();
2146 SDLoc DL(Op);
2147 SDValue New;
2148
2149 // If the new constant immediate is all-zeros or all-ones, let the target
2150 // independent DAG combine optimize this node.
2151 if (NewImm == 0 || NewImm == OrigMask) {
2152 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2153 TLO.DAG.getConstant(NewImm, DL, VT));
2154 // Otherwise, create a machine node so that target independent DAG combine
2155 // doesn't undo this optimization.
2156 } else {
2158 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2159 New = SDValue(
2160 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2161 }
2162
2163 return TLO.CombineTo(Op, New);
2164}
2165
2167 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2168 TargetLoweringOpt &TLO) const {
2169 // Delay this optimization to as late as possible.
2170 if (!TLO.LegalOps)
2171 return false;
2172
2174 return false;
2175
2176 EVT VT = Op.getValueType();
2177 if (VT.isVector())
2178 return false;
2179
2180 unsigned Size = VT.getSizeInBits();
2181 assert((Size == 32 || Size == 64) &&
2182 "i32 or i64 is expected after legalization.");
2183
2184 // Exit early if we demand all bits.
2185 if (DemandedBits.popcount() == Size)
2186 return false;
2187
2188 unsigned NewOpc;
2189 switch (Op.getOpcode()) {
2190 default:
2191 return false;
2192 case ISD::AND:
2193 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2194 break;
2195 case ISD::OR:
2196 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2197 break;
2198 case ISD::XOR:
2199 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2200 break;
2201 }
2202 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2203 if (!C)
2204 return false;
2205 uint64_t Imm = C->getZExtValue();
2206 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2207}
2208
2209/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2210 /// Mask are known to be either zero or one and return them in Known.
2212 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2213 const SelectionDAG &DAG, unsigned Depth) const {
2214 switch (Op.getOpcode()) {
2215 default:
2216 break;
2217 case AArch64ISD::DUP: {
2218 SDValue SrcOp = Op.getOperand(0);
2219 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2220 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2221 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2222 "Expected DUP implicit truncation");
2223 Known = Known.trunc(Op.getScalarValueSizeInBits());
2224 }
2225 break;
2226 }
2227 case AArch64ISD::CSEL: {
2228 KnownBits Known2;
2229 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2230 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2231 Known = Known.intersectWith(Known2);
2232 break;
2233 }
2234 case AArch64ISD::BICi: {
2235 // Compute the bit cleared value.
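    // BICi clears the bits of a byte immediate shifted by a multiple of 8,
    // e.g. imm = 0xFF with shift = 8 clears bits [15:8], giving Mask = ~0xFF00.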
2236 uint64_t Mask =
2237 ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
2238 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2239 Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
2240 break;
2241 }
2242 case AArch64ISD::VLSHR: {
2243 KnownBits Known2;
2244 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2245 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2246 Known = KnownBits::lshr(Known, Known2);
2247 break;
2248 }
2249 case AArch64ISD::VASHR: {
2250 KnownBits Known2;
2251 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2252 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2253 Known = KnownBits::ashr(Known, Known2);
2254 break;
2255 }
2256 case AArch64ISD::VSHL: {
2257 KnownBits Known2;
2258 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2259 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2260 Known = KnownBits::shl(Known, Known2);
2261 break;
2262 }
2263 case AArch64ISD::MOVI: {
2265 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2266 break;
2267 }
2269 case AArch64ISD::ADDlow: {
2270 if (!Subtarget->isTargetILP32())
2271 break;
2272 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2273 Known.Zero = APInt::getHighBitsSet(64, 32);
2274 break;
2275 }
2277 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2278 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2279 break;
2280 }
2282 Intrinsic::ID IntID =
2283 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2284 switch (IntID) {
2285 default: return;
2286 case Intrinsic::aarch64_ldaxr:
2287 case Intrinsic::aarch64_ldxr: {
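      // LDXR/LDAXR of a type narrower than the result zero-extends into the
      // full register, so every bit above the loaded width is known zero.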
2288 unsigned BitWidth = Known.getBitWidth();
2289 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2290 unsigned MemBits = VT.getScalarSizeInBits();
2291 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2292 return;
2293 }
2294 }
2295 break;
2296 }
2298 case ISD::INTRINSIC_VOID: {
2299 unsigned IntNo = Op.getConstantOperandVal(0);
2300 switch (IntNo) {
2301 default:
2302 break;
2303 case Intrinsic::aarch64_neon_uaddlv: {
2304 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2305 unsigned BitWidth = Known.getBitWidth();
2306 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
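        // UADDLV sums 8 (v8i8) or 16 (v16i8) unsigned bytes, so the result
        // fits in 11 or 12 bits respectively; all higher bits are known zero.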
2307 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2308 assert(BitWidth >= Bound && "Unexpected width!");
2310 Known.Zero |= Mask;
2311 }
2312 break;
2313 }
2314 case Intrinsic::aarch64_neon_umaxv:
2315 case Intrinsic::aarch64_neon_uminv: {
2316 // Figure out the datatype of the vector operand. The UMINV instruction
2317 // will zero extend the result, so we can mark as known zero all the
2318 // bits larger than the element datatype. 32-bit or larger doesn't need
2319 // this as those are legal types and will be handled by isel directly.
2320 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2321 unsigned BitWidth = Known.getBitWidth();
2322 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2323 assert(BitWidth >= 8 && "Unexpected width!");
2325 Known.Zero |= Mask;
2326 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2327 assert(BitWidth >= 16 && "Unexpected width!");
2329 Known.Zero |= Mask;
2330 }
2331 break;
2332 } break;
2333 }
2334 }
2335 }
2336}
2337
2339 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2340 unsigned Depth) const {
2341 EVT VT = Op.getValueType();
2342 unsigned VTBits = VT.getScalarSizeInBits();
2343 unsigned Opcode = Op.getOpcode();
2344 switch (Opcode) {
2345 case AArch64ISD::CMEQ:
2346 case AArch64ISD::CMGE:
2347 case AArch64ISD::CMGT:
2348 case AArch64ISD::CMHI:
2349 case AArch64ISD::CMHS:
2350 case AArch64ISD::FCMEQ:
2351 case AArch64ISD::FCMGE:
2352 case AArch64ISD::FCMGT:
2353 case AArch64ISD::CMEQz:
2354 case AArch64ISD::CMGEz:
2355 case AArch64ISD::CMGTz:
2356 case AArch64ISD::CMLEz:
2357 case AArch64ISD::CMLTz:
2358 case AArch64ISD::FCMEQz:
2359 case AArch64ISD::FCMGEz:
2360 case AArch64ISD::FCMGTz:
2361 case AArch64ISD::FCMLEz:
2362 case AArch64ISD::FCMLTz:
2363 // Compares return either 0 or all-ones
2364 return VTBits;
2365 }
2366
2367 return 1;
2368}
2369
2371 EVT) const {
2372 return MVT::i64;
2373}
2374
2376 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2377 unsigned *Fast) const {
2378 if (Subtarget->requiresStrictAlign())
2379 return false;
2380
2381 if (Fast) {
2382 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2383 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2384 // See comments in performSTORECombine() for more details about
2385 // these conditions.
2386
2387 // Code that uses clang vector extensions can mark that it
2388 // wants unaligned accesses to be treated as fast by
2389 // underspecifying alignment to be 1 or 2.
2390 Alignment <= 2 ||
2391
2392 // Disregard v2i64. Memcpy lowering produces those and splitting
2393 // them regresses performance on micro-benchmarks and olden/bh.
2394 VT == MVT::v2i64;
2395 }
2396 return true;
2397}
2398
2399// Same as above but handling LLTs instead.
2401 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2402 unsigned *Fast) const {
2403 if (Subtarget->requiresStrictAlign())
2404 return false;
2405
2406 if (Fast) {
2407 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2408 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2409 Ty.getSizeInBytes() != 16 ||
2410 // See comments in performSTORECombine() for more details about
2411 // these conditions.
2412
2413 // Code that uses clang vector extensions can mark that it
2414 // wants unaligned accesses to be treated as fast by
2415 // underspecifying alignment to be 1 or 2.
2416 Alignment <= 2 ||
2417
2418 // Disregard v2i64. Memcpy lowering produces those and splitting
2419 // them regresses performance on micro-benchmarks and olden/bh.
2420 Ty == LLT::fixed_vector(2, 64);
2421 }
2422 return true;
2423}
2424
2425FastISel *
2427 const TargetLibraryInfo *libInfo) const {
2428 return AArch64::createFastISel(funcInfo, libInfo);
2429}
2430
2431const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2432#define MAKE_CASE(V) \
2433 case V: \
2434 return #V;
2435 switch ((AArch64ISD::NodeType)Opcode) {
2437 break;
2754 }
2755#undef MAKE_CASE
2756 return nullptr;
2757}
2758
2761 MachineBasicBlock *MBB) const {
2762 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2763 // phi node:
2764
2765 // OrigBB:
2766 // [... previous instrs leading to comparison ...]
2767 // b.ne TrueBB
2768 // b EndBB
2769 // TrueBB:
2770 // ; Fallthrough
2771 // EndBB:
2772 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2773
2774 MachineFunction *MF = MBB->getParent();
2775 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2776 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2777 DebugLoc DL = MI.getDebugLoc();
2779
2780 Register DestReg = MI.getOperand(0).getReg();
2781 Register IfTrueReg = MI.getOperand(1).getReg();
2782 Register IfFalseReg = MI.getOperand(2).getReg();
2783 unsigned CondCode = MI.getOperand(3).getImm();
2784 bool NZCVKilled = MI.getOperand(4).isKill();
2785
2786 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2787 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2788 MF->insert(It, TrueBB);
2789 MF->insert(It, EndBB);
2790
2791 // Transfer rest of current basic-block to EndBB
2792 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2793 MBB->end());
2795
2796 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2797 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2798 MBB->addSuccessor(TrueBB);
2799 MBB->addSuccessor(EndBB);
2800
2801 // TrueBB falls through to the end.
2802 TrueBB->addSuccessor(EndBB);
2803
2804 if (!NZCVKilled) {
2805 TrueBB->addLiveIn(AArch64::NZCV);
2806 EndBB->addLiveIn(AArch64::NZCV);
2807 }
2808
2809 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2810 .addReg(IfTrueReg)
2811 .addMBB(TrueBB)
2812 .addReg(IfFalseReg)
2813 .addMBB(MBB);
2814
2815 MI.eraseFromParent();
2816 return EndBB;
2817}
2818
2820 MachineInstr &MI, MachineBasicBlock *BB) const {
2822 BB->getParent()->getFunction().getPersonalityFn())) &&
2823 "SEH does not use catchret!");
2824 return BB;
2825}
2826
2829 MachineBasicBlock *MBB) const {
2830 MachineFunction &MF = *MBB->getParent();
2831 MachineBasicBlock::iterator MBBI = MI.getIterator();
2833 const AArch64InstrInfo &TII =
2834 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
2835 Register TargetReg = MI.getOperand(0).getReg();
2837 TII.probedStackAlloc(MBBI, TargetReg, false);
2838
2839 MI.eraseFromParent();
2840 return NextInst->getParent();
2841}
2842
2844AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2846 MachineBasicBlock *BB) const {
2847 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2848 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2849
2850 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2851 MIB.add(MI.getOperand(1)); // slice index register
2852 MIB.add(MI.getOperand(2)); // slice index offset
2853 MIB.add(MI.getOperand(3)); // pg
2854 MIB.add(MI.getOperand(4)); // base
2855 MIB.add(MI.getOperand(5)); // offset
2856
2857 MI.eraseFromParent(); // The pseudo is gone now.
2858 return BB;
2859}
2860
2863 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2865 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
2866
2867 MIB.addReg(AArch64::ZA, RegState::Define);
2868 MIB.add(MI.getOperand(0)); // Vector select register
2869 MIB.add(MI.getOperand(1)); // Vector select offset
2870 MIB.add(MI.getOperand(2)); // Base
2871 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
2872
2873 MI.eraseFromParent(); // The pseudo is gone now.
2874 return BB;
2875}
2876
2879 unsigned Opcode,
2880 bool Op0IsDef) const {
2881 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2883
2884 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
2885 .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
2886 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
2887 MIB.add(MI.getOperand(I));
2888
2889 MI.eraseFromParent(); // The pseudo is gone now.
2890 return BB;
2891}
2892
2894AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
2896 MachineBasicBlock *BB, bool HasTile) const {
2897 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2898 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2899 unsigned StartIdx = 0;
2900
2901 if (HasTile) {
2902 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2903 MIB.addReg(BaseReg + MI.getOperand(0).getImm());
2904 StartIdx = 1;
2905 } else
2906 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
2907
2908 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
2909 MIB.add(MI.getOperand(I));
2910
2911 MI.eraseFromParent(); // The pseudo is gone now.
2912 return BB;
2913}
2914
2917 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2919 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
2920 MIB.add(MI.getOperand(0)); // Mask
2921
2922 unsigned Mask = MI.getOperand(0).getImm();
2923 for (unsigned I = 0; I < 8; I++) {
2924 if (Mask & (1 << I))
2925 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
2926 }
2927
2928 MI.eraseFromParent(); // The pseudo is gone now.
2929 return BB;
2930}
2931
2933 MachineInstr &MI, MachineBasicBlock *BB) const {
2934
2935 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
2936 if (SMEOrigInstr != -1) {
2937 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2938 uint64_t SMEMatrixType =
2939 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
2940 switch (SMEMatrixType) {
2942 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB, /*HasTile*/ false);
2944 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB, /*HasTile*/ true);
2946 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB, /*HasTile*/ true);
2948 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB, /*HasTile*/ true);
2950 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB, /*HasTile*/ true);
2952 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB, /*HasTile*/ true);
2953 }
2954 }
2955
2956 switch (MI.getOpcode()) {
2957 default:
2958#ifndef NDEBUG
2959 MI.dump();
2960#endif
2961 llvm_unreachable("Unexpected instruction for custom inserter!");
2962
2963 case AArch64::F128CSEL:
2964 return EmitF128CSEL(MI, BB);
2965 case TargetOpcode::STATEPOINT:
2966 // STATEPOINT is a pseudo instruction which has no implicit defs/uses
2967 // while the BL call instruction (to which the statepoint is eventually
2968 // lowered) has an implicit def. This def is early-clobber as it is set at
2969 // the moment of the call, before any use is read.
2970 // Add this implicit dead def here as a workaround.
2971 MI.addOperand(*MI.getMF(),
2973 AArch64::LR, /*isDef*/ true,
2974 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
2975 /*isUndef*/ false, /*isEarlyClobber*/ true));
2976 [[fallthrough]];
2977 case TargetOpcode::STACKMAP:
2978 case TargetOpcode::PATCHPOINT:
2979 return emitPatchPoint(MI, BB);
2980
2981 case TargetOpcode::PATCHABLE_EVENT_CALL:
2982 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
2983 return BB;
2984
2985 case AArch64::CATCHRET:
2986 return EmitLoweredCatchRet(MI, BB);
2987
2988 case AArch64::PROBED_STACKALLOC_DYN:
2989 return EmitDynamicProbedAlloc(MI, BB);
2990
2991 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
2992 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
2993 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
2994 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
2995 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
2996 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
2997 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
2998 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
2999 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
3000 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
3001 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
3002 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
3003 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
3004 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
3005 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
3006 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
3007 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
3008 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
3009 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
3010 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
3011 case AArch64::LDR_ZA_PSEUDO:
3012 return EmitFill(MI, BB);
3013 case AArch64::LDR_TX_PSEUDO:
3014 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
3015 case AArch64::STR_TX_PSEUDO:
3016 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
3017 case AArch64::ZERO_M_PSEUDO:
3018 return EmitZero(MI, BB);
3019 case AArch64::ZERO_T_PSEUDO:
3020 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
3021 }
3022}
3023
3024//===----------------------------------------------------------------------===//
3025// AArch64 Lowering private implementation.
3026//===----------------------------------------------------------------------===//
3027
3028//===----------------------------------------------------------------------===//
3029// Lowering Code
3030//===----------------------------------------------------------------------===//
3031
3032// Forward declarations of SVE fixed length lowering helpers
3037 SelectionDAG &DAG);
3040 EVT VT);
3041
3042/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3043static bool isZerosVector(const SDNode *N) {
3044 // Look through a bit convert.
3045 while (N->getOpcode() == ISD::BITCAST)
3046 N = N->getOperand(0).getNode();
3047
3049 return true;
3050
3051 if (N->getOpcode() != AArch64ISD::DUP)
3052 return false;
3053
3054 auto Opnd0 = N->getOperand(0);
3055 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3056}
3057
3058/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3059/// CC
3061 switch (CC) {
3062 default:
3063 llvm_unreachable("Unknown condition code!");
3064 case ISD::SETNE:
3065 return AArch64CC::NE;
3066 case ISD::SETEQ:
3067 return AArch64CC::EQ;
3068 case ISD::SETGT:
3069 return AArch64CC::GT;
3070 case ISD::SETGE:
3071 return AArch64CC::GE;
3072 case ISD::SETLT:
3073 return AArch64CC::LT;
3074 case ISD::SETLE:
3075 return AArch64CC::LE;
3076 case ISD::SETUGT:
3077 return AArch64CC::HI;
3078 case ISD::SETUGE:
3079 return AArch64CC::HS;
3080 case ISD::SETULT:
3081 return AArch64CC::LO;
3082 case ISD::SETULE:
3083 return AArch64CC::LS;
3084 }
3085}
3086
3087/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3089 AArch64CC::CondCode &CondCode,
3090 AArch64CC::CondCode &CondCode2) {
3091 CondCode2 = AArch64CC::AL;
3092 switch (CC) {
3093 default:
3094 llvm_unreachable("Unknown FP condition!");
3095 case ISD::SETEQ:
3096 case ISD::SETOEQ:
3097 CondCode = AArch64CC::EQ;
3098 break;
3099 case ISD::SETGT:
3100 case ISD::SETOGT:
3101 CondCode = AArch64CC::GT;
3102 break;
3103 case ISD::SETGE:
3104 case ISD::SETOGE:
3105 CondCode = AArch64CC::GE;
3106 break;
3107 case ISD::SETOLT:
3108 CondCode = AArch64CC::MI;
3109 break;
3110 case ISD::SETOLE:
3111 CondCode = AArch64CC::LS;
3112 break;
3113 case ISD::SETONE:
3114 CondCode = AArch64CC::MI;
3115 CondCode2 = AArch64CC::GT;
3116 break;
3117 case ISD::SETO:
3118 CondCode = AArch64CC::VC;
3119 break;
3120 case ISD::SETUO:
3121 CondCode = AArch64CC::VS;
3122 break;
3123 case ISD::SETUEQ:
3124 CondCode = AArch64CC::EQ;
3125 CondCode2 = AArch64CC::VS;
3126 break;
3127 case ISD::SETUGT:
3128 CondCode = AArch64CC::HI;
3129 break;
3130 case ISD::SETUGE:
3131 CondCode = AArch64CC::PL;
3132 break;
3133 case ISD::SETLT:
3134 case ISD::SETULT:
3135 CondCode = AArch64CC::LT;
3136 break;
3137 case ISD::SETLE:
3138 case ISD::SETULE:
3139 CondCode = AArch64CC::LE;
3140 break;
3141 case ISD::SETNE:
3142 case ISD::SETUNE:
3143 CondCode = AArch64CC::NE;
3144 break;
3145 }
3146}
3147
3148/// Convert a DAG fp condition code to an AArch64 CC.
3149/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3150/// should be AND'ed instead of OR'ed.
3152 AArch64CC::CondCode &CondCode,
3153 AArch64CC::CondCode &CondCode2) {
3154 CondCode2 = AArch64CC::AL;
3155 switch (CC) {
3156 default:
3157 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3158 assert(CondCode2 == AArch64CC::AL);
3159 break;
3160 case ISD::SETONE:
3161 // (a one b)
3162 // == ((a olt b) || (a ogt b))
3163 // == ((a ord b) && (a une b))
3164 CondCode = AArch64CC::VC;
3165 CondCode2 = AArch64CC::NE;
3166 break;
3167 case ISD::SETUEQ:
3168 // (a ueq b)
3169 // == ((a uno b) || (a oeq b))
3170 // == ((a ule b) && (a uge b))
3171 CondCode = AArch64CC::PL;
3172 CondCode2 = AArch64CC::LE;
3173 break;
3174 }
3175}
3176
3177/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3178/// CC usable with the vector instructions. Fewer operations are available
3179/// without a real NZCV register, so we have to use less efficient combinations
3180/// to get the same effect.
3182 AArch64CC::CondCode &CondCode,
3183 AArch64CC::CondCode &CondCode2,
3184 bool &Invert) {
3185 Invert = false;
3186 switch (CC) {
3187 default:
3188 // Mostly the scalar mappings work fine.
3189 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3190 break;
3191 case ISD::SETUO:
3192 Invert = true;
3193 [[fallthrough]];
3194 case ISD::SETO:
3195 CondCode = AArch64CC::MI;
3196 CondCode2 = AArch64CC::GE;
3197 break;
3198 case ISD::SETUEQ:
3199 case ISD::SETULT:
3200 case ISD::SETULE:
3201 case ISD::SETUGT:
3202 case ISD::SETUGE:
3203 // All of the compare-mask comparisons are ordered, but we can switch
3204 // between the two by a double inversion. E.g. ULE == !OGT.
3205 Invert = true;
3206 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3207 CondCode, CondCode2);
3208 break;
3209 }
3210}
3211
3213 // Matches AArch64DAGToDAGISel::SelectArithImmed().
3214 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3215 LLVM_DEBUG(dbgs() << "Is imm " << C
3216 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3217 return IsLegal;
3218}
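
// Illustrative sketch (not part of the original lowering code): the same
// legality rule in stand-alone form, with a few worked immediates. An AArch64
// arithmetic immediate is a 12-bit value, optionally shifted left by 12.
static bool isLegalArithImmedSketch(uint64_t C) {
  bool Unshifted = (C >> 12) == 0;                      // 0 .. 4095
  bool Shifted = (C & 0xFFFULL) == 0 && (C >> 24) == 0; // 0x1000 .. 0xFFF000
  return Unshifted || Shifted;
}
// isLegalArithImmedSketch(4095)   -> true  (fits in 12 bits)
// isLegalArithImmedSketch(0x1000) -> true  (encoded as #1, LSL #12)
// isLegalArithImmedSketch(0x1001) -> false (needs both the low and shifted field)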
3219
3220// Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
3221// the grounds that "op1 - (-op2) == op1 + op2"? Not always: the C and V flags
3222// can be set differently by this operation. It comes down to whether
3223// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3224// everything is fine. If not then the optimization is wrong. Thus general
3225// comparisons are only valid if op2 != 0.
3226//
3227// So, finally, the only LLVM-native comparisons that don't mention C and V
3228// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
3229// the absence of information about op2.
3231 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3232 (CC == ISD::SETEQ || CC == ISD::SETNE);
3233}
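
// Illustrative sketch (not part of the original lowering code): why the
// CMP -> CMN rewrite is only safe for SETEQ/SETNE. N and Z depend only on the
// numeric result, which is identical for "op1 - (-op2)" and "op1 + op2", but
// the C flag can differ, and op2 == 0 is the simplest case where it does.
struct NZCSketch { bool N, Z, C; };
static NZCSketch subsFlagsSketch(uint32_t A, uint32_t B) { // cmp A, B
  uint32_t R = A - B;
  return {(int32_t)R < 0, R == 0, A >= B /* no borrow */};
}
static NZCSketch addsFlagsSketch(uint32_t A, uint32_t B) { // cmn A, B
  uint64_t Wide = (uint64_t)A + (uint64_t)B;
  uint32_t R = (uint32_t)Wide;
  return {(int32_t)R < 0, R == 0, (Wide >> 32) != 0 /* carry out */};
}
// With op1 = 5 and op2 = 0: "cmp 5, -0" yields C = 1, but "cmn 5, 0" yields
// C = 0, so an unsigned test (HS/LO) would flip. N and Z agree in both forms,
// which is why EQ and NE remain correct.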
3234
3236 SelectionDAG &DAG, SDValue Chain,
3237 bool IsSignaling) {
3238 EVT VT = LHS.getValueType();
3239 assert(VT != MVT::f128);
3240
3241 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3242
3243 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3244 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3245 {Chain, LHS});
3246 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3247 {LHS.getValue(1), RHS});
3248 Chain = RHS.getValue(1);
3249 VT = MVT::f32;
3250 }
3251 unsigned Opcode =
3253 return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
3254}
3255
3257 const SDLoc &dl, SelectionDAG &DAG) {
3258 EVT VT = LHS.getValueType();
3259 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3260
3261 if (VT.isFloatingPoint()) {
3262 assert(VT != MVT::f128);
3263 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3264 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
3265 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
3266 VT = MVT::f32;
3267 }
3268 return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
3269 }
3270
3271 // The CMP instruction is just an alias for SUBS, and representing it as
3272 // SUBS means that it's possible to get CSE with subtract operations.
3273 // A later phase can perform the optimization of setting the destination
3274 // register to WZR/XZR if it ends up being unused.
3275 unsigned Opcode = AArch64ISD::SUBS;
3276
3277 if (isCMN(RHS, CC)) {
3278 // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
3279 Opcode = AArch64ISD::ADDS;
3280 RHS = RHS.getOperand(1);
3281 } else if (isCMN(LHS, CC)) {
3282 // As we are looking for EQ/NE compares, the operands can be commuted; can
3283 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
3284 Opcode = AArch64ISD::ADDS;
3285 LHS = LHS.getOperand(1);
3286 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3287 if (LHS.getOpcode() == ISD::AND) {
3288 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3289 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3290 // of the signed comparisons.
3291 const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
3292 DAG.getVTList(VT, MVT_CC),
3293 LHS.getOperand(0),
3294 LHS.getOperand(1));
3295 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3296 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3297 return ANDSNode.getValue(1);
3298 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3299 // Use result of ANDS
3300 return LHS.getValue(1);
3301 }
3302 }
3303
3304 return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
3305 .getValue(1);
3306}
3307
3308/// \defgroup AArch64CCMP CMP;CCMP matching
3309///
3310/// These functions deal with the formation of CMP;CCMP;... sequences.
3311/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3312/// a comparison. They set the NZCV flags to a predefined value if their
3313/// predicate is false. This allows us to express arbitrary conjunctions, for
3314/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3315/// expressed as:
3316/// cmp A
3317/// ccmp B, inv(CB), CA
3318/// check for CB flags
3319///
3320/// This naturally lets us implement chains of AND operations with SETCC
3321/// operands. And we can even implement some other situations by transforming
3322/// them:
3323/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3324/// negating the flags used in a CCMP/FCCMP operations.
3325/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3326/// by negating the flags we test for afterwards. i.e.
3327/// NEG (CMP CCMP CCCMP ...) can be implemented.
3328/// - Note that we can only ever negate all previously processed results.
3329/// What we cannot implement by flipping the flags to test is a negation
3330/// of two sub-trees (because the negation affects all sub-trees emitted so
3331/// far, so the 2nd sub-tree we emit would also affect the first).
3332/// With those tools we can implement some OR operations:
3333/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3334/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3335/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3336/// elimination rules from earlier to implement the whole thing as a
3337/// CCMP/FCCMP chain.
3338///
3339/// As a complete example:
3340/// or (or (setCA (cmp A)) (setCB (cmp B)))
3341/// (and (setCC (cmp C)) (setCD (cmp D)))"
3342/// can be reassociated to:
3343/// or (and (setCC (cmp C)) (setCD (cmp D)))
3344/// (or (setCA (cmp A)) (setCB (cmp B)))
3345/// can be transformed to:
3346/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3347/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
3348/// which can be implemented as:
3349/// cmp C
3350/// ccmp D, inv(CD), CC
3351/// ccmp A, CA, inv(CD)
3352/// ccmp B, CB, inv(CA)
3353/// check for CB flags
3354///
3355/// A counterexample is "or (and A B) (and C D)" which translates to
3356/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))); we
3357/// can only implement one of the inner (not) operations, but not both!
3358/// @{
3359
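// Illustrative sketch (not part of the original lowering code): a plain-C++
// model of the CCMP semantics the comment above relies on. If the predicate
// held on the incoming flags, CCMP behaves like a CMP of its operands;
// otherwise it installs the immediate NZCV value, which the lowering picks so
// that the final test fails.
struct CCmpFlagsSketch { bool N, Z, C, V; };
static CCmpFlagsSketch cmpFlagsSketch(int32_t L, int32_t R) {
  int64_t Wide = (int64_t)L - (int64_t)R;
  int32_t Res = (int32_t)Wide;
  return {Res < 0, Res == 0, (uint32_t)L >= (uint32_t)R, Wide != (int64_t)Res};
}
static CCmpFlagsSketch ccmpSketch(bool PredHeldOnIncomingFlags, int32_t L,
                                  int32_t R, CCmpFlagsSketch ImmNZCV) {
  return PredHeldOnIncomingFlags ? cmpFlagsSketch(L, R) : ImmNZCV;
}
// For "a == 0 && b == 0":
//   CCmpFlagsSketch F = cmpFlagsSketch(a, 0);
//   F = ccmpSketch(/*EQ held*/ F.Z, b, 0,
//                  /*NZCV chosen to fail EQ*/ {false, false, false, false});
// and the conjunction is true iff F.Z is set afterwards.
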
3360/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3362 ISD::CondCode CC, SDValue CCOp,
3363 AArch64CC::CondCode Predicate,
3364 AArch64CC::CondCode OutCC,
3365 const SDLoc &DL, SelectionDAG &DAG) {
3366 unsigned Opcode = 0;
3367 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3368
3369 if (LHS.getValueType().isFloatingPoint()) {
3370 assert(LHS.getValueType() != MVT::f128);
3371 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3372 LHS.getValueType() == MVT::bf16) {
3373 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3374 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3375 }
3376 Opcode = AArch64ISD::FCCMP;
3377 } else if (RHS.getOpcode() == ISD::SUB) {
3378 SDValue SubOp0 = RHS.getOperand(0);
3379 if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
3380 // See emitComparison() on why we can only do this for SETEQ and SETNE.
3381 Opcode = AArch64ISD::CCMN;
3382 RHS = RHS.getOperand(1);
3383 }
3384 }
3385 if (Opcode == 0)
3386 Opcode = AArch64ISD::CCMP;
3387
3388 SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
3390 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3391 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3392 return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
3393}
3394
3395/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3396/// expressed as a conjunction. See \ref AArch64CCMP.
3397/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3398/// changing the conditions on the SETCC tests.
3399/// (this means we can call emitConjunctionRec() with
3400/// Negate==true on this sub-tree)
3401/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3402/// cannot do the negation naturally. We are required to
3403/// emit the subtree first in this case.
3404/// \param WillNegate Is true if we are called when the result of this
3405/// subexpression must be negated. This happens when the
3406/// outer expression is an OR. We can use this fact to know
3407/// that we have a double negation (or (or ...) ...) that
3408/// can be implemented for free.
3409static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3410 bool &MustBeFirst, bool WillNegate,
3411 unsigned Depth = 0) {
3412 if (!Val.hasOneUse())
3413 return false;
3414 unsigned Opcode = Val->getOpcode();
3415 if (Opcode == ISD::SETCC) {
3416 if (Val->getOperand(0).getValueType() == MVT::f128)
3417 return false;
3418 CanNegate = true;
3419 MustBeFirst = false;
3420 return true;
3421 }
3422 // Protect against exponential runtime and stack overflow.
3423 if (Depth > 6)
3424 return false;
3425 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3426 bool IsOR = Opcode == ISD::OR;
3427 SDValue O0 = Val->getOperand(0);
3428 SDValue O1 = Val->getOperand(1);
3429 bool CanNegateL;
3430 bool MustBeFirstL;
3431 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3432 return false;
3433 bool CanNegateR;
3434 bool MustBeFirstR;
3435 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3436 return false;
3437
3438 if (MustBeFirstL && MustBeFirstR)
3439 return false;
3440
3441 if (IsOR) {
3442 // For an OR expression we need to be able to naturally negate at least
3443 // one side or we cannot do the transformation at all.
3444 if (!CanNegateL && !CanNegateR)
3445 return false;
3446 // If the result of the OR will be negated and we can naturally negate
3447 // the leaves, then this sub-tree as a whole negates naturally.
3448 CanNegate = WillNegate && CanNegateL && CanNegateR;
3449 // If we cannot naturally negate the whole sub-tree, then this must be
3450 // emitted first.
3451 MustBeFirst = !CanNegate;
3452 } else {
3453 assert(Opcode == ISD::AND && "Must be OR or AND");
3454 // We cannot naturally negate an AND operation.
3455 CanNegate = false;
3456 MustBeFirst = MustBeFirstL || MustBeFirstR;
3457 }
3458 return true;
3459 }
3460 return false;
3461}
3462
3463/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
3464/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
3465/// Tries to transform the given i1 producing node @p Val to a series of compare
3466/// and conditional compare operations. @returns an NZCV flags producing node
3467/// and sets @p OutCC to the flags that should be tested, or returns SDValue()
3468/// if the transformation was not possible.
3469/// \p Negate is true if we want this sub-tree to be negated just by changing
3470/// SETCC conditions.
3472 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3473 AArch64CC::CondCode Predicate) {
3474 // We're at a tree leaf, produce a conditional comparison operation.
3475 unsigned Opcode = Val->getOpcode();
3476 if (Opcode == ISD::SETCC) {
3477 SDValue LHS = Val->getOperand(0);
3478 SDValue RHS = Val->getOperand(1);
3479 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3480 bool isInteger = LHS.getValueType().isInteger();
3481 if (Negate)
3482 CC = getSetCCInverse(CC, LHS.getValueType());
3483 SDLoc DL(Val);
3484 // Determine OutCC and handle FP special case.
3485 if (isInteger) {
3486 OutCC = changeIntCCToAArch64CC(CC);
3487 } else {
3488 assert(LHS.getValueType().isFloatingPoint());
3489 AArch64CC::CondCode ExtraCC;
3490 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3491 // Some floating point conditions can't be tested with a single condition
3492 // code. Construct an additional comparison in this case.
3493 if (ExtraCC != AArch64CC::AL) {
3494 SDValue ExtraCmp;
3495 if (!CCOp.getNode())
3496 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3497 else
3498 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3499 ExtraCC, DL, DAG);
3500 CCOp = ExtraCmp;
3501 Predicate = ExtraCC;
3502 }
3503 }
3504
3505 // Produce a normal comparison if we are first in the chain
3506 if (!CCOp)
3507 return emitComparison(LHS, RHS, CC, DL, DAG);
3508 // Otherwise produce a ccmp.
3509 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3510 DAG);
3511 }
3512 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3513
3514 bool IsOR = Opcode == ISD::OR;
3515
3516 SDValue LHS = Val->getOperand(0);
3517 bool CanNegateL;
3518 bool MustBeFirstL;
3519 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3520 assert(ValidL && "Valid conjunction/disjunction tree");
3521 (void)ValidL;
3522
3523 SDValue RHS = Val->getOperand(1);
3524 bool CanNegateR;
3525 bool MustBeFirstR;
3526 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
3527 assert(ValidR && "Valid conjunction/disjunction tree");
3528 (void)ValidR;
3529
3530 // Swap sub-tree that must come first to the right side.
3531 if (MustBeFirstL) {
3532 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3533 std::swap(LHS, RHS);
3534 std::swap(CanNegateL, CanNegateR);
3535 std::swap(MustBeFirstL, MustBeFirstR);
3536 }
3537
3538 bool NegateR;
3539 bool NegateAfterR;
3540 bool NegateL;
3541 bool NegateAfterAll;
3542 if (Opcode == ISD::OR) {
3543 // Swap the sub-tree that we can negate naturally to the left.
3544 if (!CanNegateL) {
3545 assert(CanNegateR && "at least one side must be negatable");
3546 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3547 assert(!Negate);
3548 std::swap(LHS, RHS);
3549 NegateR = false;
3550 NegateAfterR = true;
3551 } else {
3552 // Negate the left sub-tree if possible, otherwise negate the result.
3553 NegateR = CanNegateR;
3554 NegateAfterR = !CanNegateR;
3555 }
3556 NegateL = true;
3557 NegateAfterAll = !Negate;
3558 } else {
3559 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3560 assert(!Negate && "Valid conjunction/disjunction tree");
3561
3562 NegateL = false;
3563 NegateR = false;
3564 NegateAfterR = false;
3565 NegateAfterAll = false;
3566 }
3567
3568 // Emit sub-trees.
3569 AArch64CC::CondCode RHSCC;
3570 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
3571 if (NegateAfterR)
3572 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
3573 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
3574 if (NegateAfterAll)
3575 OutCC = AArch64CC::getInvertedCondCode(OutCC);
3576 return CmpL;
3577}
3578
3579/// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
3580/// In some cases this is even possible with OR operations in the expression.
3581/// See \ref AArch64CCMP.
3582/// \see emitConjunctionRec().
3584 AArch64CC::CondCode &OutCC) {
3585 bool DummyCanNegate;
3586 bool DummyMustBeFirst;
3587 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
3588 return SDValue();
3589
3590 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
3591}
3592
3593/// @}
3594
3595/// Returns how profitable it is to fold a comparison's operand's shift and/or
3596/// extension operations.
3598 auto isSupportedExtend = [&](SDValue V) {
3599 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
3600 return true;
3601
3602 if (V.getOpcode() == ISD::AND)
3603 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
3604 uint64_t Mask = MaskCst->getZExtValue();
3605 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
3606 }
3607
3608 return false;
3609 };
3610
3611 if (!Op.hasOneUse())
3612 return 0;
3613
3614 if (isSupportedExtend(Op))
3615 return 1;
3616
3617 unsigned Opc = Op.getOpcode();
3618 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
3619 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3620 uint64_t Shift = ShiftCst->getZExtValue();
3621 if (isSupportedExtend(Op.getOperand(0)))
3622 return (Shift <= 4) ? 2 : 1;
3623 EVT VT = Op.getValueType();
3624 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
3625 return 1;
3626 }
3627
3628 return 0;
3629}
3630
3632 SDValue &AArch64cc, SelectionDAG &DAG,
3633 const SDLoc &dl) {
3634 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
3635 EVT VT = RHS.getValueType();
3636 uint64_t C = RHSC->getZExtValue();
3637 if (!isLegalArithImmed(C)) {
3638 // Constant does not fit, try adjusting it by one?
3639 switch (CC) {
3640 default:
3641 break;
3642 case ISD::SETLT:
3643 case ISD::SETGE:
3644 if ((VT == MVT::i32 && C != 0x80000000 &&
3645 isLegalArithImmed((uint32_t)(C - 1))) ||
3646 (VT == MVT::i64 && C != 0x80000000ULL &&
3647 isLegalArithImmed(C - 1ULL))) {
3649 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3650 RHS = DAG.getConstant(C, dl, VT);
3651 }
3652 break;
3653 case ISD::SETULT:
3654 case ISD::SETUGE:
3655 if ((VT == MVT::i32 && C != 0 &&
3656 isLegalArithImmed((uint32_t)(C - 1))) ||
3657 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
3659 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3660 RHS = DAG.getConstant(C, dl, VT);
3661 }
3662 break;
3663 case ISD::SETLE:
3664 case ISD::SETGT:
3665 if ((VT == MVT::i32 && C != INT32_MAX &&
3666 isLegalArithImmed((uint32_t)(C + 1))) ||
3667 (VT == MVT::i64 && C != INT64_MAX &&
3668 isLegalArithImmed(C + 1ULL))) {
3670 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3671 RHS = DAG.getConstant(C, dl, VT);
3672 }
3673 break;
3674 case ISD::SETULE:
3675 case ISD::SETUGT:
3676 if ((VT == MVT::i32 && C != UINT32_MAX &&
3677 isLegalArithImmed((uint32_t)(C + 1))) ||
3678 (VT == MVT::i64 && C != UINT64_MAX &&
3679 isLegalArithImmed(C + 1ULL))) {
3681 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3682 RHS = DAG.getConstant(C, dl, VT);
3683 }
3684 break;
3685 }
3686 }
3687 }
3688
3689 // Comparisons are canonicalized so that the RHS operand is simpler than the
3690 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
3691 // can fold some shift+extend operations on the RHS operand, so swap the
3692 // operands if that can be done.
3693 //
3694 // For example:
3695 // lsl w13, w11, #1
3696 // cmp w13, w12
3697 // can be turned into:
3698 // cmp w12, w11, lsl #1
3699 if (!isa<ConstantSDNode>(RHS) || !isLegalArithImmed(RHS->getAsZExtVal())) {
3700 SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
3701
3703 std::swap(LHS, RHS);
3705 }
3706 }
3707
3708 SDValue Cmp;
3709 AArch64CC::CondCode AArch64CC;
3710 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
3711 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
3712
3713 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
3714 // For the i8 operand, the largest immediate is 255, so this can be easily
3715 // encoded in the compare instruction. For the i16 operand, however, the
3716 // largest immediate cannot be encoded in the compare.
3717 // Therefore, use a sign extending load and cmn to avoid materializing the
3718 // -1 constant. For example,
3719 // movz w1, #65535
3720 // ldrh w0, [x0, #0]
3721 // cmp w0, w1
3722 // >
3723 // ldrsh w0, [x0, #0]
3724 // cmn w0, #1
3725 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
3726 // if and only if (sext LHS) == (sext RHS). The checks are in place to
3727 // ensure both the LHS and RHS are truly zero extended and to make sure the
3728 // transformation is profitable.
3729 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
3730 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
3731 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
3732 LHS.getNode()->hasNUsesOfValue(1, 0)) {
3733 int16_t ValueofRHS = RHS->getAsZExtVal();
3734 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
3735 SDValue SExt =
3736 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
3737 DAG.getValueType(MVT::i16));
3738 Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
3739 RHS.getValueType()),
3740 CC, dl, DAG);
3741 AArch64CC = changeIntCCToAArch64CC(CC);
3742 }
3743 }
3744
3745 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
3746 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
3747 if ((CC == ISD::SETNE) ^ RHSC->isZero())
3748 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
3749 }
3750 }
3751 }
3752
3753 if (!Cmp) {
3754 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
3755 AArch64CC = changeIntCCToAArch64CC(CC);
3756 }
3757 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
3758 return Cmp;
3759}
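
// Illustrative sketch (not part of the original lowering code): the "adjust
// the constant by one" rewrites performed above, in plain integer terms. Each
// rewrite is applied only when the adjusted constant is a legal arithmetic
// immediate and the guarded edge case (e.g. C == INT32_MIN) cannot occur.
static bool signedLessThanViaLESketch(int32_t X, int32_t C) {
  assert(C != INT32_MIN && "x < INT32_MIN is always false; not rewritten");
  return X <= C - 1; // (x < C)  ==  (x <= C - 1)
}
static bool unsignedGreaterThanViaGESketch(uint32_t X, uint32_t C) {
  assert(C != UINT32_MAX && "x > UINT32_MAX is always false; not rewritten");
  return X >= C + 1; // (x > C)  ==  (x >= C + 1)
}
// Example: "w0 < 0x2001" cannot use 0x2001 directly (not a valid arithmetic
// immediate), but the equivalent "w0 <= 0x2000" can, since 0x2000 encodes as
// #2, LSL #12.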
3760
3761static std::pair<SDValue, SDValue>
3763 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
3764 "Unsupported value type");
3765 SDValue Value, Overflow;
3766 SDLoc DL(Op);
3767 SDValue LHS = Op.getOperand(0);
3768 SDValue RHS = Op.getOperand(1);
3769 unsigned Opc = 0;
3770 switch (Op.getOpcode()) {
3771 default:
3772 llvm_unreachable("Unknown overflow instruction!");
3773 case ISD::SADDO:
3774 Opc = AArch64ISD::ADDS;
3775 CC = AArch64CC::VS;
3776 break;
3777 case ISD::UADDO:
3778 Opc = AArch64ISD::ADDS;
3779 CC = AArch64CC::HS;
3780 break;
3781 case ISD::SSUBO:
3782 Opc = AArch64ISD::SUBS;
3783 CC = AArch64CC::VS;
3784 break;
3785 case ISD::USUBO:
3786 Opc = AArch64ISD::SUBS;
3787 CC = AArch64CC::LO;
3788 break;
3789 // Multiply needs a little bit of extra work.
3790 case ISD::SMULO:
3791 case ISD::UMULO: {
3792 CC = AArch64CC::NE;
3793 bool IsSigned = Op.getOpcode() == ISD::SMULO;
3794 if (Op.getValueType() == MVT::i32) {
3795 // Extend to 64-bits, then perform a 64-bit multiply.
3796 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3797 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
3798 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
3799 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3800 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
3801
3802 // Check that the result fits into a 32-bit integer.
3803 SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
3804 if (IsSigned) {
3805 // cmp xreg, wreg, sxtw
3806 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
3807 Overflow =
3808 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
3809 } else {
3810 // tst xreg, #0xffffffff00000000
3811 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
3812 Overflow =
3813 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
3814 }
3815 break;
3816 }
3817 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
3818 // For the 64 bit multiply
3819 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3820 if (IsSigned) {
3821 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
3822 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
3823 DAG.getConstant(63, DL, MVT::i64));
3824 // It is important that LowerBits is last, otherwise the arithmetic
3825 // shift will not be folded into the compare (SUBS).
3826 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3827 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
3828 .getValue(1);
3829 } else {
3830 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
3831 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3832 Overflow =
3833 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
3834 DAG.getConstant(0, DL, MVT::i64),
3835 UpperBits).getValue(1);
3836 }
3837 break;
3838 }
3839 } // switch (...)
3840
3841 if (Opc) {
3842 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
3843
3844 // Emit the AArch64 operation with overflow check.
3845 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
3846 Overflow = Value.getValue(1);
3847 }
3848 return std::make_pair(Value, Overflow);
3849}
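
// Illustrative sketch (not part of the original lowering code): the i32
// smul.with.overflow check performed above, in plain C++. Operands are widened
// to 64 bits and multiplied; the signed form overflows iff the 64-bit product
// differs from the sign-extended low 32 bits (the unsigned form instead checks
// that the high 32 bits are zero).
static bool smulo32Sketch(int32_t A, int32_t B, int32_t &Product) {
  int64_t Wide = (int64_t)A * (int64_t)B;
  Product = (int32_t)Wide;
  return Wide != (int64_t)Product; // true iff the multiply overflowed
}
// smulo32Sketch(46341, 46341, P)  -> true  (46341 * 46341 > INT32_MAX)
// smulo32Sketch(-46341, 46340, P) -> false (the product fits in 32 bits)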
3850
3851SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
3852 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
3853 !Subtarget->isNeonAvailable()))
3854 return LowerToScalableOp(Op, DAG);
3855
3856 SDValue Sel = Op.getOperand(0);
3857 SDValue Other = Op.getOperand(1);
3858 SDLoc dl(Sel);
3859
3860 // If the operand is an overflow checking operation, invert the condition
3861 // code and kill the Not operation. I.e., transform:
3862 // (xor overflow_op_bool, 1)
3863 // -->
3864 // (csel 1, 0, invert(cc), overflow_op_bool)
3865 // ... which later gets transformed to just a cset instruction with an
3866 // inverted condition code, rather than a cset + eor sequence.
3868 // Only lower legal XALUO ops.
3870 return SDValue();
3871
3872 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3873 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3875 SDValue Value, Overflow;
3876 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
3877 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3878 return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
3879 CCVal, Overflow);
3880 }
3881 // If neither operand is a SELECT_CC, give up.
3882 if (Sel.getOpcode() != ISD::SELECT_CC)
3883 std::swap(Sel, Other);
3884 if (Sel.getOpcode() != ISD::SELECT_CC)
3885 return Op;
3886
3887 // The folding we want to perform is:
3888 // (xor x, (select_cc a, b, cc, 0, -1) )
3889 // -->
3890 // (csel x, (xor x, -1), cc ...)
3891 //
3892 // The latter will get matched to a CSINV instruction.
3893
3894 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
3895 SDValue LHS = Sel.getOperand(0);
3896 SDValue RHS = Sel.getOperand(1);
3897 SDValue TVal = Sel.getOperand(2);
3898 SDValue FVal = Sel.getOperand(3);
3899
3900 // FIXME: This could be generalized to non-integer comparisons.
3901 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
3902 return Op;
3903
3904 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
3905 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
3906
3907 // The values aren't constants, this isn't the pattern we're looking for.
3908 if (!CFVal || !CTVal)
3909 return Op;
3910
3911 // We can commute the SELECT_CC by inverting the condition. This
3912 // might be needed to make this fit into a CSINV pattern.
3913 if (CTVal->isAllOnes() && CFVal->isZero()) {
3914 std::swap(TVal, FVal);
3915 std::swap(CTVal, CFVal);
3916 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
3917 }
3918
3919 // If the constants line up, perform the transform!
3920 if (CTVal->isZero() && CFVal->isAllOnes()) {
3921 SDValue CCVal;
3922 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
3923
3924 FVal = Other;
3925 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
3926 DAG.getConstant(-1ULL, dl, Other.getValueType()));
3927
3928 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
3929 CCVal, Cmp);
3930 }
3931
3932 return Op;
3933}
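
// Illustrative sketch (not part of the original lowering code): the scalar
// identity behind the fold above. XOR with a 0/-1 mask produced by a
// comparison is a select between the value and its bitwise NOT, which is what
// CSINV provides in a single instruction.
static uint64_t xorSelectMaskSketch(uint64_t X, bool Cond) {
  uint64_t Mask = Cond ? 0 : ~0ULL; // (select_cc a, b, cc, 0, -1)
  uint64_t ViaXor = X ^ Mask;       // the original (xor x, mask)
  uint64_t ViaCsel = Cond ? X : ~X; // (csel x, (xor x, -1), cc) -> CSINV
  assert(ViaXor == ViaCsel);
  return ViaCsel;
}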
3934
3935// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
3936// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
3937// sets 'C' bit to 0.
3939 SDLoc DL(Value);
3940 EVT VT = Value.getValueType();
3941 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
3942 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
3943 SDValue Cmp =
3944 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
3945 return Cmp.getValue(1);
3946}
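
// Illustrative sketch (not part of the original lowering code): the C flag
// produced by the SUBS emitted above. SUBS sets C to "no borrow", i.e.
// C = (LHS >= RHS) as unsigned values, so SUBS(value, 1) gives
// C = (value != 0) and SUBS(0, value) gives C = (value == 0), which is the
// inverted form used for SBCS.
static bool carryFromValueSketch(uint64_t Value, bool Invert) {
  uint64_t LHS = Invert ? 0 : Value;
  uint64_t RHS = Invert ? Value : 1;
  return LHS >= RHS; // the C flag of SUBS LHS, RHS
}
// carryFromValueSketch(1, false) -> true   carryFromValueSketch(0, false) -> false
// carryFromValueSketch(0, true)  -> true   carryFromValueSketch(7, true)  -> false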
3947
3948// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
3949// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
3951 bool Invert) {
3952 assert(Glue.getResNo() == 1);
3953 SDLoc DL(Glue);
3954 SDValue Zero = DAG.getConstant(0, DL, VT);
3955 SDValue One = DAG.getConstant(1, DL, VT);
3956 unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
3957 SDValue CC = DAG.getConstant(Cond, DL, MVT::i32);
3958 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
3959}
3960
3961// Value is 1 if 'V' bit of NZCV is 1, else 0
3963 assert(Glue.getResNo() == 1);
3964 SDLoc DL(Glue);
3965 SDValue Zero = DAG.getConstant(0, DL, VT);
3966 SDValue One = DAG.getConstant(1, DL, VT);
3967 SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32);
3968 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
3969}
3970
3971// This lowering is inefficient, but it will get cleaned up by
3972// `foldOverflowCheck`
3974 unsigned Opcode, bool IsSigned) {
3975 EVT VT0 = Op.getValue(0).getValueType();
3976 EVT VT1 = Op.getValue(1).getValueType();
3977
3978 if (VT0 != MVT::i32 && VT0 != MVT::i64)
3979 return SDValue();
3980
3981 bool InvertCarry = Opcode == AArch64ISD::SBCS;
3982 SDValue OpLHS = Op.getOperand(0);
3983 SDValue OpRHS = Op.getOperand(1);
3984 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
3985
3986 SDLoc DL(Op);
3987 SDVTList VTs = DAG.getVTList(VT0, VT1);
3988
3989 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
3990 OpRHS, OpCarryIn);
3991
3992 SDValue OutFlag =
3993 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
3994 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
3995
3996 return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag);
3997}
3998
4000 // Let legalize expand this if it isn't a legal type yet.
4001 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4002 return SDValue();
4003
4004 SDLoc dl(Op);
4006 // The actual operation that sets the overflow or carry flag.
4007 SDValue Value, Overflow;
4008 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
4009
4010 // We use 0 and 1 as false and true values.
4011 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4012 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4013
4014 // We use an inverted condition, because the conditional select is inverted
4015 // too. This will allow it to be selected to a single instruction:
4016 // CSINC Wd, WZR, WZR, invert(cond).
4017 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
4018 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
4019 CCVal, Overflow);
4020
4021 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
4022 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4023}
4024
4025// Prefetch operands are:
4026// 1: Address to prefetch
4027// 2: bool isWrite
4028// 3: int locality (0 = no locality ... 3 = extreme locality)
4029// 4: bool isDataCache
4031 SDLoc DL(Op);
4032 unsigned IsWrite = Op.getConstantOperandVal(2);
4033 unsigned Locality = Op.getConstantOperandVal(3);
4034 unsigned IsData = Op.getConstantOperandVal(4);
4035
4036 bool IsStream = !Locality;
4037 // When the locality number is set
4038 if (Locality) {
4039 // The front-end should have filtered out the out-of-range values
4040 assert(Locality <= 3 && "Prefetch locality out-of-range");
4041 // The locality degree is the opposite of the cache speed.
4042 // Put the number the other way around.
4043 // The encoding starts at 0 for level 1
4044 Locality = 3 - Locality;
4045 }
4046
4047 // Build the mask value encoding the expected behavior.
4048 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4049 (!IsData << 3) | // IsDataCache bit
4050 (Locality << 1) | // Cache level bits
4051 (unsigned)IsStream; // Stream bit
4052 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
4053 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
4054 Op.getOperand(1));
4055}
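
// Illustrative sketch (not part of the original lowering code): the PrfOp
// value built above, for a couple of concrete prefetches. Locality 3 maps to
// cache-level bits 0 (L1), and locality 0 selects the streaming variant.
static unsigned prefetchOpSketch(bool IsWrite, unsigned Locality, bool IsData) {
  bool IsStream = Locality == 0;
  unsigned Level = IsStream ? 0 : 3 - Locality; // 3 -> L1, 2 -> L2, 1 -> L3
  return (unsigned(IsWrite) << 4) | (unsigned(!IsData) << 3) | (Level << 1) |
         unsigned(IsStream);
}
// prefetchOpSketch(false, 3, true) == 0x00, the PLDL1KEEP hint
// prefetchOpSketch(true, 1, true)  == 0x14, the PSTL3KEEP hint
// prefetchOpSketch(false, 0, true) == 0x01, the PLDL1STRM hint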
4056
4057SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4058 SelectionDAG &DAG) const {
4059 EVT VT = Op.getValueType();
4060 if (VT.isScalableVector())
4061 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4062
4063 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4064 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4065
4066 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4067 return SDValue();
4068}
4069
4070SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4071 SelectionDAG &DAG) const {
4072 EVT VT = Op.getValueType();
4073 if (VT.isScalableVector())
4074 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4075
4076 bool IsStrict = Op->isStrictFPOpcode();
4077 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4078 EVT SrcVT = SrcVal.getValueType();
4079 bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
4080
4081 if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
4082 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4083
4084 // Expand cases where the result type is BF16 but we don't have hardware
4085 // instructions to lower it.
4086 if (VT.getScalarType() == MVT::bf16 &&
4087 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4088 Subtarget->hasBF16())) {
4089 SDLoc dl(Op);
4090 SDValue Narrow = SrcVal;
4091 SDValue NaN;
4092 EVT I32 = SrcVT.changeElementType(MVT::i32);
4093 EVT F32 = SrcVT.changeElementType(MVT::f32);
4094 if (SrcVT.getScalarType() == MVT::f32) {
4095 bool NeverSNaN = DAG.isKnownNeverSNaN(Narrow);
4096 Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4097 if (!NeverSNaN) {
4098 // Set the quiet bit.
4099 NaN = DAG.getNode(ISD::OR, dl, I32, Narrow,
4100 DAG.getConstant(0x400000, dl, I32));
4101 }
4102 } else if (SrcVT.getScalarType() == MVT::f64) {
4103 Narrow = DAG.getNode(AArch64ISD::FCVTXN, dl, F32, Narrow);
4104 Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4105 } else {
4106 return SDValue();
4107 }
4108 if (!Trunc) {
4109 SDValue One = DAG.getConstant(1, dl, I32);
4110 SDValue Lsb = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4111 DAG.getShiftAmountConstant(16, I32, dl));
4112 Lsb = DAG.getNode(ISD::AND, dl, I32, Lsb, One);
4113 SDValue RoundingBias =
4114 DAG.getNode(ISD::ADD, dl, I32, DAG.getConstant(0x7fff, dl, I32), Lsb);
4115 Narrow = DAG.getNode(ISD::ADD, dl, I32, Narrow, RoundingBias);
4116 }
4117
4118 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4119 // 0x80000000.
4120 if (NaN) {
4121 SDValue IsNaN = DAG.getSetCC(
4122 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT),
4123 SrcVal, SrcVal, ISD::SETUO);
4124 Narrow = DAG.getSelect(dl, I32, IsNaN, NaN, Narrow);
4125 }
4126
4127 // Now that we have rounded, shift the bits into position.
4128 Narrow = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4129 DAG.getShiftAmountConstant(16, I32, dl));
4130 if (VT.isVector()) {
4131 EVT I16 = I32.changeVectorElementType(MVT::i16);
4132 Narrow = DAG.getNode(ISD::TRUNCATE, dl, I16, Narrow);
4133 return DAG.getNode(ISD::BITCAST, dl, VT, Narrow);
4134 }
4135 Narrow = DAG.getNode(ISD::BITCAST, dl, F32, Narrow);
4136 SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, dl, VT, Narrow);
4137 return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, dl)
4138 : Result;
4139 }
4140
4141 if (SrcVT != MVT::f128) {
4142 // Expand cases where the input is a vector bigger than NEON.
4144 return SDValue();
4145
4146 // It's legal except when f128 is involved
4147 return Op;
4148 }
4149
4150 return SDValue();
4151}
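
// Illustrative sketch (not part of the original lowering code): the
// round-to-nearest-even bias used in the bf16 path above, applied to the raw
// bits of an f32. The caller is assumed to select the quieted NaN value
// separately (as the lowering does via the 0x400000 quiet bit), since adding
// the bias to a NaN pattern such as 0x7fffffff would carry into the sign bit.
static uint16_t f32BitsToBF16Sketch(uint32_t Bits) {
  uint32_t Lsb = (Bits >> 16) & 1;        // lowest bit that survives truncation
  uint32_t Rounded = Bits + 0x7FFF + Lsb; // bias; ties round to the even value
  return (uint16_t)(Rounded >> 16);       // keep the top 16 bits
}
// 0x3F800001 (just above 1.0f)             -> 0x3F80 (1.0 in bf16)
// 0x3F808000 (exactly halfway, even below) -> 0x3F80 (ties-to-even rounds down)
// 0x3F818000 (exactly halfway, odd below)  -> 0x3F82 (ties-to-even rounds up)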
4152
4153SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4154 SelectionDAG &DAG) const {
4155 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4156 // Any additional optimization in this function should be recorded
4157 // in the cost tables.
4158 bool IsStrict = Op->isStrictFPOpcode();
4159 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
4160 EVT VT = Op.getValueType();
4161
4162 if (VT.isScalableVector()) {
4163 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4166 return LowerToPredicatedOp(Op, DAG, Opcode);
4167 }
4168
4169 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4170 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4171 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4172
4173 unsigned NumElts = InVT.getVectorNumElements();
4174
4175 // f16 conversions are promoted to f32 when full fp16 is not supported.
4176 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4177 InVT.getVectorElementType() == MVT::bf16) {
4178 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
4179 SDLoc dl(Op);
4180 if (IsStrict) {
4181 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
4182 {Op.getOperand(0), Op.getOperand(1)});
4183 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4184 {Ext.getValue(1), Ext.getValue(0)});
4185 }
4186 return DAG.getNode(
4187 Op.getOpcode(), dl, Op.getValueType(),
4188 DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
4189 }
4190
4191 uint64_t VTSize = VT.getFixedSizeInBits();
4192 uint64_t InVTSize = InVT.getFixedSizeInBits();
4193 if (VTSize < InVTSize) {
4194 SDLoc dl(Op);
4195 if (IsStrict) {
4197 SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other},
4198 {Op.getOperand(0), Op.getOperand(1)});
4199 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4200 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl);
4201 }
4202 SDValue Cv =
4203 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
4204 Op.getOperand(0));
4205 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4206 }
4207
4208 if (VTSize > InVTSize) {
4209 SDLoc dl(Op);
4210 MVT ExtVT =
4213 if (IsStrict) {
4214 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other},
4215 {Op.getOperand(0), Op.getOperand(1)});
4216 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4217 {Ext.getValue(1), Ext.getValue(0)});
4218 }
4219 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
4220 return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
4221 }
4222
4223 // Use a scalar operation for conversions between single-element vectors of
4224 // the same size.
4225 if (NumElts == 1) {
4226 SDLoc dl(Op);
4227 SDValue Extract = DAG.getNode(
4229 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64));
4230 EVT ScalarVT = VT.getScalarType();
4231 if (IsStrict)
4232 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4233 {Op.getOperand(0), Extract});
4234 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4235 }
4236
4237 // Type changing conversions are illegal.
4238 return Op;
4239}
4240
4241SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4242 SelectionDAG &DAG) const {
4243 bool IsStrict = Op->isStrictFPOpcode();
4244 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4245
4246 if (SrcVal.getValueType().isVector())
4247 return LowerVectorFP_TO_INT(Op, DAG);
4248
4249 // f16 conversions are promoted to f32 when full fp16 is not supported.
4250 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4251 SrcVal.getValueType() == MVT::bf16) {
4252 SDLoc dl(Op);
4253 if (IsStrict) {
4254 SDValue Ext =
4255 DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
4256 {Op.getOperand(0), SrcVal});
4257 return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other},
4258 {Ext.getValue(1), Ext.getValue(0)});
4259 }
4260 return DAG.getNode(
4261 Op.getOpcode(), dl, Op.getValueType(),
4262 DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
4263 }
4264
4265 if (SrcVal.getValueType() != MVT::f128) {
4266 // It's legal except when f128 is involved
4267 return Op;
4268 }
4269
4270 return SDValue();
4271}
4272
4273SDValue
4274AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4275 SelectionDAG &DAG) const {
4276 // AArch64 FP-to-int conversions saturate to the destination element size, so
4277 // we can lower common saturating conversions to simple instructions.
4278 SDValue SrcVal = Op.getOperand(0);
4279 EVT SrcVT = SrcVal.getValueType();
4280 EVT DstVT = Op.getValueType();
4281 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4282
4283 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4284 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4285 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4286 assert(SatWidth <= DstElementWidth &&
4287 "Saturation width cannot exceed result width");
4288
4289 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4290 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4291 // types, so this is hard to reach.
4292 if (DstVT.isScalableVector())
4293 return SDValue();
4294
4295 EVT SrcElementVT = SrcVT.getVectorElementType();
4296
4297 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4298 if ((SrcElementVT == MVT::f16 &&
4299 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
4300 SrcElementVT == MVT::bf16) {
4301 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
4302 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), F32VT, SrcVal);
4303 SrcVT = F32VT;
4304 SrcElementVT = MVT::f32;
4305 SrcElementWidth = 32;
4306 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4307 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
4308 return SDValue();
4309
4310 SDLoc DL(Op);
4311 // Cases that we can emit directly.
4312 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth)
4313 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4314 DAG.getValueType(DstVT.getScalarType()));
4315
4316 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4317 // result. This is only valid if the legal cvt is larger than the saturate
4318 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
4319 // (at least until sqxtn is selected).
4320 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4321 return SDValue();
4322
4323 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4324 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
4325 DAG.getValueType(IntVT.getScalarType()));
4326 SDValue Sat;
4327 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4328 SDValue MinC = DAG.getConstant(
4329 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4330 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
4331 SDValue MaxC = DAG.getConstant(
4332 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4333 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
4334 } else {
4335 SDValue MinC = DAG.getConstant(
4336 APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
4337 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
4338 }
4339
4340 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4341}
4342
4343SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
4344 SelectionDAG &DAG) const {
4345 // AArch64 FP-to-int conversions saturate to the destination register size, so
4346 // we can lower common saturating conversions to simple instructions.
4347 SDValue SrcVal = Op.getOperand(0);
4348 EVT SrcVT = SrcVal.getValueType();
4349
4350 if (SrcVT.isVector())
4351 return LowerVectorFP_TO_INT_SAT(Op, DAG);
4352
4353 EVT DstVT = Op.getValueType();
4354 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4355 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4356 uint64_t DstWidth = DstVT.getScalarSizeInBits();
4357 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
4358
4359 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4360 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
4361 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
4362 SrcVT = MVT::f32;
4363 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
4364 SrcVT != MVT::bf16)
4365 return SDValue();
4366
4367 SDLoc DL(Op);
4368 // Cases that we can emit directly.
4369 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
4370 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
4371 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
4372 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4373 DAG.getValueType(DstVT));
4374
4375 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4376 // result. This is only valid if the legal cvt is larger than the saturate
4377 // width.
4378 if (DstWidth < SatWidth)
4379 return SDValue();
4380
4381 SDValue NativeCvt =
4382 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
4383 SDValue Sat;
4384 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4385 SDValue MinC = DAG.getConstant(
4386 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
4387 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
4388 SDValue MaxC = DAG.getConstant(
4389 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
4390 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
4391 } else {
4392 SDValue MinC = DAG.getConstant(
4393 APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
4394 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
4395 }
4396
4397 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4398}
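
// Illustrative sketch (not part of the original lowering code): the shape of
// the saturating conversion above when the saturation width is narrower than
// the native conversion, e.g. llvm.fptosi.sat.i16.f32 going through a 32-bit
// fcvtzs. The first step stands in for fcvtzs (which already saturates to the
// i32 range and maps NaN to 0); the second is the SMIN/SMAX clamp; the last is
// the final truncate.
static int16_t fptosiSatI16Sketch(float V) {
  int32_t Native = V >= 2147483648.0f   ? INT32_MAX
                   : V < -2147483648.0f ? INT32_MIN
                   : V != V             ? 0
                                        : (int32_t)V;
  int32_t Clamped = Native > INT16_MAX   ? INT16_MAX
                    : Native < INT16_MIN ? INT16_MIN
                                         : Native;
  return (int16_t)Clamped;
}
// fptosiSatI16Sketch(1e9f)   -> 32767   fptosiSatI16Sketch(-1e9f) -> -32768
// fptosiSatI16Sketch(100.5f) -> 100     a NaN input                -> 0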
4399
4400SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
4401 SelectionDAG &DAG) const {
4402 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4403 // Any additional optimization in this function should be recorded
4404 // in the cost tables.
4405 bool IsStrict = Op->isStrictFPOpcode();
4406 EVT VT = Op.getValueType();
4407 SDLoc dl(Op);
4408 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
4409 EVT InVT = In.getValueType();
4410 unsigned Opc = Op.getOpcode();
4411 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
4412
4413 if (VT.isScalableVector()) {
4414 if (InVT.getVectorElementType() == MVT::i1) {
4415 // We can't directly extend an SVE predicate; extend it first.
4416 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4417 EVT CastVT = getPromotedVTForPredicate(InVT);
4418 In = DAG.getNode(CastOpc, dl, CastVT, In);
4419 return DAG.getNode(Opc, dl, VT, In);
4420 }
4421
4422 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
4424 return LowerToPredicatedOp(Op, DAG, Opcode);
4425 }
4426
4427 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4428 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4429 return LowerFixedLengthIntToFPToSVE(Op, DAG);
4430
4431 // Promote bf16 conversions to f32.
4432 if (VT.getVectorElementType() == MVT::bf16) {
4433 EVT F32 = VT.changeElementType(MVT::f32);
4434 if (IsStrict) {
4435 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {F32, MVT::Other},
4436 {Op.getOperand(0), In});
4437 return DAG.getNode(
4438 ISD::STRICT_FP_ROUND, dl, {Op.getValueType(), MVT::Other},
4439 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4440 }
4441 return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
4442 DAG.getNode(Op.getOpcode(), dl, F32, In),
4443 DAG.getIntPtrConstant(0, dl));
4444 }
4445
4446 uint64_t VTSize = VT.getFixedSizeInBits();
4447 uint64_t InVTSize = InVT.getFixedSizeInBits();
4448 if (VTSize < InVTSize) {
4449 MVT CastVT =
4450 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
4451 InVT.getVectorNumElements());
4452 if (IsStrict) {
4453 In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
4454 {Op.getOperand(0), In});
4455 return DAG.getNode(
4456 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
4457 {In.getValue(1), In.getValue(0), DAG.getIntPtrConstant(0, dl)});
4458 }
4459 In = DAG.getNode(Opc, dl, CastVT, In);
4460 return DAG.getNode(ISD::FP_ROUND, dl, VT, In,
4461 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
4462 }
4463
4464 if (VTSize > InVTSize) {
4465 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4466 EVT CastVT = VT.changeVectorElementTypeToInteger();
4467 In = DAG.getNode(CastOpc, dl, CastVT, In);
4468 if (IsStrict)
4469 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In});
4470 return DAG.getNode(Opc, dl, VT, In);
4471 }
4472
4473 // Use a scalar operation for conversions between single-element vectors of
4474 // the same size.
4475 if (VT.getVectorNumElements() == 1) {
4476 SDValue Extract = DAG.getNode(
4477 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4478 In, DAG.getConstant(0, dl, MVT::i64));
4479 EVT ScalarVT = VT.getScalarType();
4480 if (IsStrict)
4481 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4482 {Op.getOperand(0), Extract});
4483 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4484 }
4485
4486 return Op;
4487}
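// A rough sketch of the paths above in IR terms (assuming plain NEON types):
//   sitofp <2 x i64> to <2 x float>  -> fptrunc(sitofp <2 x i64> to <2 x double>)
//   sitofp <4 x i16> to <4 x float>  -> sitofp(sext <4 x i16> to <4 x i32>)
//   sitofp <1 x i64> to <1 x double> -> a scalar i64 -> f64 conversion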
4488
4489SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
4490 SelectionDAG &DAG) const {
4491 if (Op.getValueType().isVector())
4492 return LowerVectorINT_TO_FP(Op, DAG);
4493
4494 bool IsStrict = Op->isStrictFPOpcode();
4495 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4496
4497 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
4498 Op->getOpcode() == ISD::SINT_TO_FP;
4499
4500 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
4501 SDLoc dl(Op);
4502 if (IsStrict) {
4503 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {PromoteVT, MVT::Other},
4504 {Op.getOperand(0), SrcVal});
4505 return DAG.getNode(
4506 ISD::STRICT_FP_ROUND, dl, {Op.getValueType(), MVT::Other},
4507 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4508 }
4509 return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
4510 DAG.getNode(Op.getOpcode(), dl, PromoteVT, SrcVal),
4511 DAG.getIntPtrConstant(0, dl));
4512 };
4513
4514 if (Op.getValueType() == MVT::bf16) {
4515 unsigned MaxWidth = IsSigned
4516 ? DAG.ComputeMaxSignificantBits(SrcVal)
4517 : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
4518 // bf16 conversions are promoted to f32 when the source has at most 24 significant bits (e.g. i16); f32's 24-bit significand represents such values exactly.
4519 if (MaxWidth <= 24) {
4520 return IntToFpViaPromotion(MVT::f32);
4521 }
4522
4523 // bf16 conversions are promoted to f64 when the source has at most 53 significant bits (e.g. i32); f64's 53-bit significand represents such values exactly.
4524 if (MaxWidth <= 53) {
4525 return IntToFpViaPromotion(MVT::f64);
4526 }
4527
4528 // We need to be careful about i64 -> bf16.
4529 // Consider an i32 22216703.
4530 // This number cannot be represented exactly as an f32, so an itofp will
4531 // turn it into 22216704.0, and an fptrunc to bf16 will then turn this into
4532 // 22282240.0. However, the correct bf16 result is 22151168.0.
4533 // We need to use sticky rounding to get this correct.
4534 if (SrcVal.getValueType() == MVT::i64) {
4535 SDLoc DL(Op);
4536 // This algorithm is equivalent to the following:
4537 // uint64_t SrcHi = SrcVal & ~0xfffull;
4538 // uint64_t SrcLo = SrcVal & 0xfffull;
4539 // uint64_t Highest = SrcVal >> 53;
4540 // bool HasHighest = Highest != 0;
4541 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
4542 // double Rounded = static_cast<double>(ToRound);
4543 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
4544 // uint64_t HasLo = SrcLo != 0;
4545 // bool NeedsAdjustment = HasHighest & HasLo;
4546 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
4547 // double Adjusted = std::bit_cast<double>(AdjustedBits);
4548 // return static_cast<__bf16>(Adjusted);
4549 //
4550 // Essentially, what happens is that SrcVal either fits perfectly in a
4551 // double-precision value or it is too big. If it is sufficiently small,
4552 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
4553 // ensure that u64 -> double has no rounding error by only using the 52
4554 // MSB of the input. The low order bits will get merged into a sticky bit
4555 // which will avoid issues incurred by double rounding.
4556
4557 // Signed conversion is more or less like so:
4558 // copysign((__bf16)abs(SrcVal), SrcVal)
4559 SDValue SignBit;
4560 if (IsSigned) {
4561 SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4562 DAG.getConstant(1ull << 63, DL, MVT::i64));
4563 SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
4564 }
4565 SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4566 DAG.getConstant(~0xfffull, DL, MVT::i64));
4567 SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4568 DAG.getConstant(0xfffull, DL, MVT::i64));
4569 SDValue Highest =
4570 DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
4571 DAG.getShiftAmountConstant(53, MVT::i64, DL));
4572 SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
4573 SDValue ToRound =
4574 DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
4575 SDValue Rounded =
4576 IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
4577 {Op.getOperand(0), ToRound})
4578 : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
4579
4580 SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
4581 if (SignBit) {
4582 RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
4583 }
4584
4585 SDValue HasHighest = DAG.getSetCC(
4586 DL,
4587 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
4588 Highest, Zero64, ISD::SETNE);
4589
4590 SDValue HasLo = DAG.getSetCC(
4591 DL,
4592 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
4593 SrcLo, Zero64, ISD::SETNE);
4594
4595 SDValue NeedsAdjustment =
4596 DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
4597 NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
4598
4599 SDValue AdjustedBits =
4600 DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
4601 SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
4602 return IsStrict
4603 ? DAG.getNode(ISD::STRICT_FP_ROUND, DL,
4604 {Op.getValueType(), MVT::Other},
4605 {Rounded.getValue(1), Adjusted,
4606 DAG.getIntPtrConstant(0, DL)})
4607 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
4608 DAG.getIntPtrConstant(0, DL, true));
4609 }
4610 }
4611
4612 // f16 conversions are promoted to f32 when full fp16 is not supported.
4613 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
4614 return IntToFpViaPromotion(MVT::f32);
4615 }
4616
4617 // i128 conversions are libcalls.
4618 if (SrcVal.getValueType() == MVT::i128)
4619 return SDValue();
4620
4621 // Other conversions are legal, unless it's to the completely software-based
4622 // fp128.
4623 if (Op.getValueType() != MVT::f128)
4624 return Op;
4625 return SDValue();
4626}
4627
4628SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
4629 SelectionDAG &DAG) const {
4630 // For iOS, we want to call an alternative entry point: __sincos_stret,
4631 // which returns the values in two S / D registers.
4632 SDLoc dl(Op);
4633 SDValue Arg = Op.getOperand(0);
4634 EVT ArgVT = Arg.getValueType();
4635 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
4636
4637 ArgListTy Args;
4638 ArgListEntry Entry;
4639
4640 Entry.Node = Arg;
4641 Entry.Ty = ArgTy;
4642 Entry.IsSExt = false;
4643 Entry.IsZExt = false;
4644 Args.push_back(Entry);
4645
4646 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
4647 : RTLIB::SINCOS_STRET_F32;
4648 const char *LibcallName = getLibcallName(LC);
4649 SDValue Callee =
4650 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
4651
4652 StructType *RetTy = StructType::get(ArgTy, ArgTy);
4653 TargetLowering::CallLoweringInfo CLI(DAG);
4654 CLI.setDebugLoc(dl)
4655 .setChain(DAG.getEntryNode())
4656 .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
4657
4658 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
4659 return CallResult.first;
4660}
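// Conceptually (a sketch, not the exact ABI declaration), this lowering turns
// separate sin(x) and cos(x) computations into a single call that returns both
// results in floating-point registers:
//   struct { float Sin, Cos; } R = __sincos_stret(X); // f64 variant for double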
4661
4662static MVT getSVEContainerType(EVT ContentTy);
4663
4664SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
4665 SelectionDAG &DAG) const {
4666 EVT OpVT = Op.getValueType();
4667 EVT ArgVT = Op.getOperand(0).getValueType();
4668
4669 if (useSVEForFixedLengthVectorVT(OpVT, !Subtarget->isNeonAvailable()))
4670 return LowerFixedLengthBitcastToSVE(Op, DAG);
4671
4672 if (OpVT.isScalableVector()) {
4673 // Bitcasting between unpacked vector types of different element counts is
4674 // not a NOP because the live elements are laid out differently.
4675 // 01234567
4676 // e.g. nxv2i32 = XX??XX??
4677 // nxv4f16 = X?X?X?X?
4678 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
4679 return SDValue();
4680
4681 if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) {
4682 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
4683 "Expected int->fp bitcast!");
4684 SDValue ExtResult =
4685 DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
4686 Op.getOperand(0));
4687 return getSVESafeBitCast(OpVT, ExtResult, DAG);
4688 }
4689 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
4690 }
4691
4692 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
4693 return SDValue();
4694
4695 // Bitcasts between f16 and bf16 are legal.
4696 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
4697 return Op;
4698
4699 assert(ArgVT == MVT::i16);
4700 SDLoc DL(Op);
4701
4702 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
4703 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
4704 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
4705}
4706
4707static EVT getExtensionTo64Bits(const EVT &OrigVT) {
4708 if (OrigVT.getSizeInBits() >= 64)
4709 return OrigVT;
4710
4711 assert(OrigVT.isSimple() && "Expecting a simple value type");
4712
4713 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
4714 switch (OrigSimpleTy) {
4715 default: llvm_unreachable("Unexpected Vector Type");
4716 case MVT::v2i8:
4717 case MVT::v2i16:
4718 return MVT::v2i32;
4719 case MVT::v4i8:
4720 return MVT::v4i16;
4721 }
4722}
4723
4723
4724static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
4725 const EVT &OrigTy,
4726 const EVT &ExtTy,
4727 unsigned ExtOpcode) {
4728 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
4729 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
4730 // 64-bits we need to insert a new extension so that it will be 64-bits.
4731 assert(ExtTy.is128BitVector() && "Unexpected extension size");
4732 if (OrigTy.getSizeInBits() >= 64)
4733 return N;
4734
4735 // Must extend size to at least 64 bits to be used as an operand for VMULL.
4736 EVT NewVT = getExtensionTo64Bits(OrigTy);
4737
4738 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
4739}
4740
4741// Returns lane if Op extracts from a two-element vector and lane is constant
4742// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
4743static std::optional<uint64_t>
4744getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
4745 SDNode *OpNode = Op.getNode();
4746 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
4747 return std::nullopt;
4748
4749 EVT VT = OpNode->getOperand(0).getValueType();
4750 ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
4751 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
4752 return std::nullopt;
4753
4754 return C->getZExtValue();
4755}
4756
4757static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
4758 bool isSigned) {
4759 EVT VT = N.getValueType();
4760
4761 if (N.getOpcode() != ISD::BUILD_VECTOR)
4762 return false;
4763
4764 for (const SDValue &Elt : N->op_values()) {
4765 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
4766 unsigned EltSize = VT.getScalarSizeInBits();
4767 unsigned HalfSize = EltSize / 2;
4768 if (isSigned) {
4769 if (!isIntN(HalfSize, C->getSExtValue()))
4770 return false;
4771 } else {
4772 if (!isUIntN(HalfSize, C->getZExtValue()))
4773 return false;
4774 }
4775 continue;
4776 }
4777 return false;
4778 }
4779
4780 return true;
4781}
4782
4783static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
4784 EVT VT = N.getValueType();
4785 assert(VT.is128BitVector() && "Unexpected vector MULL size");
4786
4787 unsigned NumElts = VT.getVectorNumElements();
4788 unsigned OrigEltSize = VT.getScalarSizeInBits();
4789 unsigned EltSize = OrigEltSize / 2;
4790 MVT TruncVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
4791
4792 APInt HiBits = APInt::getHighBitsSet(OrigEltSize, EltSize);
4793 if (DAG.MaskedValueIsZero(N, HiBits))
4794 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), TruncVT, N);
4795
4796 if (ISD::isExtOpcode(N.getOpcode()))
4797 return addRequiredExtensionForVectorMULL(N.getOperand(0), DAG,
4798 N.getOperand(0).getValueType(), VT,
4799 N.getOpcode());
4800
4801 assert(N.getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
4802 SDLoc dl(N);
4803 SmallVector<SDValue, 8> Ops;
4804 for (unsigned i = 0; i != NumElts; ++i) {
4805 const APInt &CInt = N.getConstantOperandAPInt(i);
4806 // Element types smaller than 32 bits are not legal, so use i32 elements.
4807 // The values are implicitly truncated so sext vs. zext doesn't matter.
4808 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
4809 }
4810 return DAG.getBuildVector(TruncVT, dl, Ops);
4811}
4812
4813static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
4814 return N.getOpcode() == ISD::SIGN_EXTEND ||
4815 N.getOpcode() == ISD::ANY_EXTEND ||
4816 isExtendedBUILD_VECTOR(N, DAG, true);
4817}
4818
4819static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {
4820 return N.getOpcode() == ISD::ZERO_EXTEND ||
4821 N.getOpcode() == ISD::ANY_EXTEND ||
4822 isExtendedBUILD_VECTOR(N, DAG, false);
4823}
4824
4825static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
4826 unsigned Opcode = N.getOpcode();
4827 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4828 SDValue N0 = N.getOperand(0);
4829 SDValue N1 = N.getOperand(1);
4830 return N0->hasOneUse() && N1->hasOneUse() &&
4831 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
4832 }
4833 return false;
4834}
4835
4836static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
4837 unsigned Opcode = N.getOpcode();
4838 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4839 SDValue N0 = N.getOperand(0);
4840 SDValue N1 = N.getOperand(1);
4841 return N0->hasOneUse() && N1->hasOneUse() &&
4842 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
4843 }
4844 return false;
4845}
4846
4847SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
4848 SelectionDAG &DAG) const {
4849 // The rounding mode is in bits 23:22 of the FPCR.
4850 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
4851 // The formula we use to implement this is (((FPCR + (1 << 22)) >> 22) & 3)
4852 // so that the shift + and get folded into a bitfield extract.
4853 SDLoc dl(Op);
4854
4855 SDValue Chain = Op.getOperand(0);
4856 SDValue FPCR_64 = DAG.getNode(
4857 ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
4858 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
4859 Chain = FPCR_64.getValue(1);
4860 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
4861 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
4862 DAG.getConstant(1U << 22, dl, MVT::i32));
4863 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
4864 DAG.getConstant(22, dl, MVT::i32));
4865 SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
4866 DAG.getConstant(3, dl, MVT::i32));
4867 return DAG.getMergeValues({AND, Chain}, dl);
4868}
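// Worked example of the formula above: if FPCR[23:22] reads 2 (round towards
// minus infinity), the lowering computes ((2 + 1) & 3) == 3, the FLT_ROUNDS
// value for downward rounding; 3 (round towards zero) wraps around to 0.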
4869
4870SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
4871 SelectionDAG &DAG) const {
4872 SDLoc DL(Op);
4873 SDValue Chain = Op->getOperand(0);
4874 SDValue RMValue = Op->getOperand(1);
4875
4876 // The rounding mode is in bits 23:22 of the FPCR.
4877 // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
4878 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
4879 // (((arg - 1) & 3) << 22).
4880 //
4881 // The argument of llvm.set.rounding must be within the segment [0, 3], so
4882 // NearestTiesToAway (4) is not handled here. It is the responsibility of the
4883 // code that generates llvm.set.rounding to ensure this condition.
4884
4885 // Calculate new value of FPCR[23:22].
4886 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
4887 DAG.getConstant(1, DL, MVT::i32));
4888 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
4889 DAG.getConstant(0x3, DL, MVT::i32));
4890 RMValue =
4891 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
4892 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
4893 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
4894
4895 // Get current value of FPCR.
4896 SDValue Ops[] = {
4897 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
4898 SDValue FPCR =
4899 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
4900 Chain = FPCR.getValue(1);
4901 FPCR = FPCR.getValue(0);
4902
4903 // Put the new rounding mode into FPCR[23:22].
4904 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
4905 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
4906 DAG.getConstant(RMMask, DL, MVT::i64));
4907 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
4908 SDValue Ops2[] = {
4909 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
4910 FPCR};
4911 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
4912}
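// Worked example of the inverse mapping: an llvm.set.rounding argument of 1
// (to nearest) gives ((1 - 1) & 3) == 0, the FPCR RN encoding, while an
// argument of 0 (towards zero) gives ((0 - 1) & 3) == 3, the RZ encoding.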
4913
4914static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
4915 SDLoc DL, bool &IsMLA) {
4916 bool IsN0SExt = isSignExtended(N0, DAG);
4917 bool IsN1SExt = isSignExtended(N1, DAG);
4918 if (IsN0SExt && IsN1SExt)
4919 return AArch64ISD::SMULL;
4920
4921 bool IsN0ZExt = isZeroExtended(N0, DAG);
4922 bool IsN1ZExt = isZeroExtended(N1, DAG);
4923
4924 if (IsN0ZExt && IsN1ZExt)
4925 return AArch64ISD::UMULL;
4926
4927 // Select SMULL if we can replace zext with sext.
4928 if (((IsN0SExt && IsN1ZExt) || (IsN0ZExt && IsN1SExt)) &&
4929 !isExtendedBUILD_VECTOR(N0, DAG, false) &&
4930 !isExtendedBUILD_VECTOR(N1, DAG, false)) {
4931 SDValue ZextOperand;
4932 if (IsN0ZExt)
4933 ZextOperand = N0.getOperand(0);
4934 else
4935 ZextOperand = N1.getOperand(0);
4936 if (DAG.SignBitIsZero(ZextOperand)) {
4937 SDValue NewSext =
4938 DAG.getSExtOrTrunc(ZextOperand, DL, N0.getValueType());
4939 if (IsN0ZExt)
4940 N0 = NewSext;
4941 else
4942 N1 = NewSext;
4943 return AArch64ISD::SMULL;
4944 }
4945 }
4946
4947 // Select UMULL if we can replace the other operand with an extend.
4948 if (IsN0ZExt || IsN1ZExt) {
4949 EVT VT = N0.getValueType();
4950 APInt Mask = APInt::getHighBitsSet(VT.getScalarSizeInBits(),
4951 VT.getScalarSizeInBits() / 2);
4952 if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
4953 return AArch64ISD::UMULL;
4954 }
4955
4956 if (!IsN1SExt && !IsN1ZExt)
4957 return 0;
4958
4959 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
4960 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
4961 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
4962 IsMLA = true;
4963 return AArch64ISD::SMULL;
4964 }
4965 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
4966 IsMLA = true;
4967 return AArch64ISD::UMULL;
4968 }
4969 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
4970 std::swap(N0, N1);
4971 IsMLA = true;
4972 return AArch64ISD::UMULL;
4973 }
4974 return 0;
4975}
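// A sketch of what this helper enables, in IR terms:
//   mul (sext <8 x i8> %a to <8 x i16>), (sext <8 x i8> %b to <8 x i16>)
//     -> SMULL %a, %b
//   mul (zext <8 x i8> %a to <8 x i16>), (zext <8 x i8> %b to <8 x i16>)
//     -> UMULL %a, %b
// When one operand is an add/sub of extends, IsMLA is set so that LowerMUL can
// split the multiply and feed multiply-accumulate instructions instead.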
4976
4977SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
4978 EVT VT = Op.getValueType();
4979
4980 bool OverrideNEON = !Subtarget->isNeonAvailable();
4981 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
4982 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
4983
4984 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
4985 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
4986 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
4987 "unexpected type for custom-lowering ISD::MUL");
4988 SDValue N0 = Op.getOperand(0);
4989 SDValue N1 = Op.getOperand(1);
4990 bool isMLA = false;
4991 EVT OVT = VT;
4992 if (VT.is64BitVector()) {
4993 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4994 isNullConstant(N0.getOperand(1)) &&
4995 N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4996 isNullConstant(N1.getOperand(1))) {
4997 N0 = N0.getOperand(0);
4998 N1 = N1.getOperand(0);
4999 VT = N0.getValueType();
5000 } else {
5001 if (VT == MVT::v1i64) {
5002 if (Subtarget->hasSVE())
5003 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5004 // Fall through to expand this. It is not legal.
5005 return SDValue();
5006 } else
5007 // Other vector multiplications are legal.
5008 return Op;
5009 }
5010 }
5011
5012 SDLoc DL(Op);
5013 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
5014
5015 if (!NewOpc) {
5016 if (VT.getVectorElementType() == MVT::i64) {
5017 // If SVE is available then i64 vector multiplications can also be made
5018 // legal.
5019 if (Subtarget->hasSVE())
5020 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5021 // Fall through to expand this. It is not legal.
5022 return SDValue();
5023 } else
5024 // Other vector multiplications are legal.
5025 return Op;
5026 }
5027
5028 // Legalize to a S/UMULL instruction
5029 SDValue Op0;
5030 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
5031 if (!isMLA) {
5032 Op0 = skipExtensionForVectorMULL(N0, DAG);
5033 assert(Op0.getValueType().is64BitVector() &&
5034 Op1.getValueType().is64BitVector() &&
5035 "unexpected types for extended operands to VMULL");
5036 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
5037 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
5038 DAG.getConstant(0, DL, MVT::i64));
5039 }
5040 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
5041 // isel lowering to take advantage of no-stall back-to-back s/umul + s/umla.
5042 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57.
5043 SDValue N00 = skipExtensionForVectorMULL(N0.getOperand(0), DAG);
5044 SDValue N01 = skipExtensionForVectorMULL(N0.getOperand(1), DAG);
5045 EVT Op1VT = Op1.getValueType();
5046 return DAG.getNode(
5047 ISD::EXTRACT_SUBVECTOR, DL, OVT,
5048 DAG.getNode(N0.getOpcode(), DL, VT,
5049 DAG.getNode(NewOpc, DL, VT,
5050 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5051 DAG.getNode(NewOpc, DL, VT,
5052 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5053 DAG.getConstant(0, DL, MVT::i64));
5054}
5055
5056static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5057 int Pattern) {
5058 if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
5059 return DAG.getConstant(1, DL, MVT::nxv1i1);
5060 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
5061 DAG.getTargetConstant(Pattern, DL, MVT::i32));
5062}
5063
5064static SDValue optimizeWhile(SDValue Op, SelectionDAG &DAG, bool IsSigned,
5065 bool IsLess, bool IsEqual) {
5066 if (!isa<ConstantSDNode>(Op.getOperand(1)) ||
5067 !isa<ConstantSDNode>(Op.getOperand(2)))
5068 return SDValue();
5069
5070 SDLoc dl(Op);
5071 APInt X = Op.getConstantOperandAPInt(1);
5072 APInt Y = Op.getConstantOperandAPInt(2);
5073 APInt NumActiveElems;
5074 bool Overflow;
5075 if (IsLess)
5076 NumActiveElems = IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
5077 else
5078 NumActiveElems = IsSigned ? X.ssub_ov(Y, Overflow) : X.usub_ov(Y, Overflow);
5079
5080 if (Overflow)
5081 return SDValue();
5082
5083 if (IsEqual) {
5084 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
5085 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
5086 : NumActiveElems.uadd_ov(One, Overflow);
5087 if (Overflow)
5088 return SDValue();
5089 }
5090
5091 std::optional<unsigned> PredPattern =
5092 getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue());
5093 unsigned MinSVEVectorSize = std::max(
5094 DAG.getSubtarget<AArch64Subtarget>().getMinSVEVectorSizeInBits(), 128u);
5095 unsigned ElementSize = 128 / Op.getValueType().getVectorMinNumElements();
5096 if (PredPattern != std::nullopt &&
5097 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
5098 return getPTrue(DAG, dl, Op.getValueType(), *PredPattern);
5099
5100 return SDValue();
5101}
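// Example (assuming the minimum 128-bit SVE vector length): whilelo(0, 4)
// producing <vscale x 4 x i1> has 4 known-active lanes, which always fit, so it
// folds to PTRUE with pattern VL4; whilelo(0, 8) on the same type may need more
// lanes than are guaranteed to exist, so it is left as a WHILE instruction.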
5102
5103// Returns a safe bitcast between two scalable vector predicates, where
5104// any newly created lanes from a widening bitcast are defined as zero.
5105static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
5106 SDLoc DL(Op);
5107 EVT InVT = Op.getValueType();
5108
5109 assert(InVT.getVectorElementType() == MVT::i1 &&
5110 VT.getVectorElementType() == MVT::i1 &&
5111 "Expected a predicate-to-predicate bitcast");
5112 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5113 InVT.isScalableVector() &&
5114 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
5115 "Only expect to cast between legal scalable predicate types!");
5116
5117 // Return the operand if the cast isn't changing type,
5118 // e.g. <n x 16 x i1> -> <n x 16 x i1>
5119 if (InVT == VT)
5120 return Op;
5121
5122 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
5123
5124 // We only have to zero the lanes if new lanes are being defined, e.g. when
5125 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
5126 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
5127 // we can return here.
5128 if (InVT.bitsGT(VT))
5129 return Reinterpret;
5130
5131 // Check if the other lanes are already known to be zeroed by
5132 // construction.
5133 if (isZeroingInactiveLanes(Op))
5134 return Reinterpret;
5135
5136 // Zero the newly introduced lanes.
5137 SDValue Mask = DAG.getConstant(1, DL, InVT);
5138 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
5139 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
5140}
5141
5142SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
5143 SDValue Chain, SDLoc DL,
5144 EVT VT) const {
5145 SDValue Callee = DAG.getExternalSymbol("__arm_sme_state",
5146 getPointerTy(DAG.getDataLayout()));
5147 Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
5148 Type *RetTy = StructType::get(Int64Ty, Int64Ty);
5149 TargetLowering::CallLoweringInfo CLI(DAG);
5150 ArgListTy Args;
5151 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
5152 CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2,
5153 RetTy, Callee, std::move(Args));
5154 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5155 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
5156 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
5157 Mask);
5158}
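// Conceptually (a rough sketch of what the call above computes):
//   State    = __arm_sme_state();  // SME state support routine
//   PStateSM = State.x0 & 1;       // bit 0 of the first result is PSTATE.SM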
5159
5160// Lower an SME LDR/STR ZA intrinsic
5161// Case 1: If the vector number (vecnum) is an immediate in range, it gets
5162// folded into the instruction
5163// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
5164// Case 2: If the vecnum is not an immediate, then it is used to modify the base
5165// and tile slice registers
5166// ldr(%tileslice, %ptr, %vecnum)
5167// ->
5168// %svl = rdsvl
5169// %ptr2 = %ptr + %svl * %vecnum
5170// %tileslice2 = %tileslice + %vecnum
5171// ldr [%tileslice2, 0], [%ptr2, 0]
5172 // Case 3: If the vecnum is an immediate out of range, then the same is done as
5173 // case 2, but the base and slice registers are modified by the largest
5174 // multiple of 16 not exceeding the vecnum, and the remainder is folded into the
5175 // instruction. This means that successive loads and stores that are offset from
5176 // each other can share the same base and slice register updates.
5177 // ldr(%tileslice, %ptr, 22)
5178 // ldr(%tileslice, %ptr, 23)
5179 // ->
5180 // %svl = rdsvl
5181 // %ptr2 = %ptr + %svl * 16
5182 // %tileslice2 = %tileslice + 16
5183 // ldr [%tileslice2, 6], [%ptr2, 6]
5184 // ldr [%tileslice2, 7], [%ptr2, 7]
5185// Case 4: If the vecnum is an add of an immediate, then the non-immediate
5186// operand and the immediate can be folded into the instruction, like case 2.
5187// ldr(%tileslice, %ptr, %vecnum + 7)
5188// ldr(%tileslice, %ptr, %vecnum + 8)
5189// ->
5190// %svl = rdsvl
5191// %ptr2 = %ptr + %svl * %vecnum
5192// %tileslice2 = %tileslice + %vecnum
5193// ldr [%tileslice2, 7], [%ptr2, 7]
5194// ldr [%tileslice2, 8], [%ptr2, 8]
5195// Case 5: The vecnum being an add of an immediate out of range is also handled,
5196// in which case the same remainder logic as case 3 is used.
5197static SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
5198 SDLoc DL(N);
5199
5200 SDValue TileSlice = N->getOperand(2);
5201 SDValue Base = N->getOperand(3);
5202 SDValue VecNum = N->getOperand(4);
5203 int32_t ConstAddend = 0;
5204 SDValue VarAddend = VecNum;
5205
5206 // If the vnum is an add of an immediate, we can fold it into the instruction
5207 if (VecNum.getOpcode() == ISD::ADD &&
5208 isa<ConstantSDNode>(VecNum.getOperand(1))) {
5209 ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
5210 VarAddend = VecNum.getOperand(0);
5211 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
5212 ConstAddend = ImmNode->getSExtValue();
5213 VarAddend = SDValue();
5214 }
5215
5216 int32_t ImmAddend = ConstAddend % 16;
5217 if (int32_t C = (ConstAddend - ImmAddend)) {
5218 SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
5219 VarAddend = VarAddend
5220 ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
5221 : CVal;
5222 }
5223
5224 if (VarAddend) {
5225 // Get the vector length that will be multiplied by vnum
5226 auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
5227 DAG.getConstant(1, DL, MVT::i32));
5228
5229 // Multiply SVL and vnum then add it to the base
5230 SDValue Mul = DAG.getNode(
5231 ISD::MUL, DL, MVT::i64,
5232 {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
5233 Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
5234 // Just add vnum to the tileslice
5235 TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
5236 }
5237
5238 return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
5239 DL, MVT::Other,
5240 {/*Chain=*/N.getOperand(0), TileSlice, Base,
5241 DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
5242}
5243
5244SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
5245 SelectionDAG &DAG) const {
5246 unsigned IntNo = Op.getConstantOperandVal(1);
5247 SDLoc DL(Op);
5248 switch (IntNo) {
5249 default:
5250 return SDValue(); // Don't custom lower most intrinsics.
5251 case Intrinsic::aarch64_prefetch: {
5252 SDValue Chain = Op.getOperand(0);
5253 SDValue Addr = Op.getOperand(2);
5254
5255 unsigned IsWrite = Op.getConstantOperandVal(3);
5256 unsigned Locality = Op.getConstantOperandVal(4);
5257 unsigned IsStream = Op.getConstantOperandVal(5);
5258 unsigned IsData = Op.getConstantOperandVal(6);
5259 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
5260 (!IsData << 3) | // IsDataCache bit
5261 (Locality << 1) | // Cache level bits
5262 (unsigned)IsStream; // Stream bit
5263
5264 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
5265 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
5266 }
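// Worked example of the PrfOp encoding above: IsWrite=1, Locality=0,
// IsStream=0, IsData=1 gives (1 << 4) | (0 << 3) | (0 << 1) | 0 == 16
// (0b10000), which is the immediate for PSTL1KEEP (prefetch for store, L1,
// temporal).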
5267 case Intrinsic::aarch64_sme_str:
5268 case Intrinsic::aarch64_sme_ldr: {
5269 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
5270 }
5271 case Intrinsic::aarch64_sme_za_enable:
5272 return DAG.getNode(
5273 AArch64ISD::SMSTART, DL, MVT::Other,
5274 Op->getOperand(0), // Chain
5275 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5276 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5277 case Intrinsic::aarch64_sme_za_disable:
5278 return DAG.getNode(
5279 AArch64ISD::SMSTOP, DL, MVT::Other,
5280 Op->getOperand(0), // Chain
5281 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5282 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5283 }
5284}
5285
5286SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5287 SelectionDAG &DAG) const {
5288 unsigned IntNo = Op.getConstantOperandVal(1);
5289 SDLoc DL(Op);
5290 switch (IntNo) {
5291 default:
5292 return SDValue(); // Don't custom lower most intrinsics.
5293 case Intrinsic::aarch64_mops_memset_tag: {
5294 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
5295 SDValue Chain = Node->getChain();
5296 SDValue Dst = Op.getOperand(2);
5297 SDValue Val = Op.getOperand(3);
5298 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
5299 SDValue Size = Op.getOperand(4);
5300 auto Alignment = Node->getMemOperand()->getAlign();
5301 bool IsVol = Node->isVolatile();
5302 auto DstPtrInfo = Node->getPointerInfo();
5303
5304 const auto &SDI =
5305 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
5306 SDValue MS =
5307 SDI.EmitMOPS(AArch64ISD::MOPS_MEMSET_TAGGING, DAG, DL, Chain, Dst, Val,
5308 Size, Alignment, IsVol, DstPtrInfo, MachinePointerInfo{});
5309
5310 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
5311 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
5312 // LowerOperationWrapper will complain that the number of results has
5313 // changed.
5314 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
5315 }
5316 }
5317}
5318
5319SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
5320 SelectionDAG &DAG) const {
5321 unsigned IntNo = Op.getConstantOperandVal(0);
5322 SDLoc dl(Op);
5323 switch (IntNo) {
5324 default: return SDValue(); // Don't custom lower most intrinsics.
5325 case Intrinsic::thread_pointer: {
5326 EVT PtrVT = getPointerTy(DAG.getDataLayout());
5327 return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
5328 }
5329 case Intrinsic::aarch64_neon_abs: {
5330 EVT Ty = Op.getValueType();
5331 if (Ty == MVT::i64) {
5332 SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
5333 Op.getOperand(1));
5334 Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
5335 return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
5336 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
5337 return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
5338 } else {
5339 report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
5340 }
5341 }
5342 case Intrinsic::aarch64_neon_pmull64: {
5343 SDValue LHS = Op.getOperand(1);
5344 SDValue RHS = Op.getOperand(2);
5345
5346 std::optional<uint64_t> LHSLane =
5347 getConstantLaneNumOfExtractHalfOperand(LHS);
5348 std::optional<uint64_t> RHSLane =
5349 getConstantLaneNumOfExtractHalfOperand(RHS);
5350
5351 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
5352 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
5353
5354 // 'aarch64_neon_pmull64' takes i64 parameters, while the pmull/pmull2
5355 // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
5356 // which ISel recognizes better. For example, generate a ldr into d*
5357 // registers as opposed to a GPR load followed by a fmov.
5358 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
5359 std::optional<uint64_t> OtherLane,
5360 const SDLoc &dl,
5361 SelectionDAG &DAG) -> SDValue {
5362 // If the operand is a higher half itself, rewrite it to
5363 // extract_high_v2i64; this way aarch64_neon_pmull64 could
5364 // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
5365 if (NLane && *NLane == 1)
5366 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
5367 N.getOperand(0), DAG.getConstant(1, dl, MVT::i64));
5368
5369 // Operand N is not a higher half but the other operand is.
5370 if (OtherLane && *OtherLane == 1) {
5371 // If this operand is a lower half, rewrite it to
5372 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
5373 // align lanes of two operands. A roundtrip sequence (to move from lane
5374 // 1 to lane 0) is like this:
5375 // mov x8, v0.d[1]
5376 // fmov d0, x8
5377 if (NLane && *NLane == 0)
5378 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
5379 DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64,
5380 N.getOperand(0),
5381 DAG.getConstant(0, dl, MVT::i64)),
5382 DAG.getConstant(1, dl, MVT::i64));
5383
5384 // Otherwise just dup from main to all lanes.
5385 return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N);
5386 }
5387
5388 // Neither operand is an extract of higher half, so codegen may just use
5389 // the non-high version of PMULL instruction. Use v1i64 to represent i64.
5390 assert(N.getValueType() == MVT::i64 &&
5391 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
5392 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N);
5393 };
5394
5395 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG);
5396 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG);
5397
5398 return DAG.getNode(AArch64ISD::PMULL, dl, Op.getValueType(), LHS, RHS);
5399 }
5400 case Intrinsic::aarch64_neon_smax:
5401 return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
5402 Op.getOperand(1), Op.getOperand(2));
5403 case Intrinsic::aarch64_neon_umax:
5404 return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
5405 Op.getOperand(1), Op.getOperand(2));
5406 case Intrinsic::aarch64_neon_smin:
5407 return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
5408 Op.getOperand(1), Op.getOperand(2));
5409 case Intrinsic::aarch64_neon_umin:
5410 return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
5411 Op.getOperand(1), Op.getOperand(2));
5412 case Intrinsic::aarch64_neon_scalar_sqxtn:
5413 case Intrinsic::aarch64_neon_scalar_sqxtun:
5414 case Intrinsic::aarch64_neon_scalar_uqxtn: {
5415 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
5416 if (Op.getValueType() == MVT::i32)
5417 return DAG.getNode(ISD::BITCAST, dl, MVT::i32,
5418 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::f32,
5419 Op.getOperand(0),
5420 DAG.getNode(ISD::BITCAST, dl, MVT::f64,
5421 Op.getOperand(1))));
5422 return SDValue();
5423 }
5424 case Intrinsic::aarch64_sve_whilelo:
5425 return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/true,
5426 /*IsEqual=*/false);
5427 case Intrinsic::aarch64_sve_whilelt:
5428 return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/true,
5429 /*IsEqual=*/false);
5430 case Intrinsic::aarch64_sve_whilels:
5431 return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/true,
5432 /*IsEqual=*/true);
5433 case Intrinsic::aarch64_sve_whilele:
5434 return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/true,
5435 /*IsEqual=*/true);
5436 case Intrinsic::aarch64_sve_whilege:
5437 return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/false,
5438 /*IsEqual=*/true);
5439 case Intrinsic::aarch64_sve_whilegt:
5440 return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/false,
5441 /*IsEqual=*/false);
5442 case Intrinsic::aarch64_sve_whilehs:
5443 return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/false,
5444 /*IsEqual=*/true);
5445 case Intrinsic::aarch64_sve_whilehi:
5446 return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/false,
5447 /*IsEqual=*/false);
5448 case Intrinsic::aarch64_sve_sunpkhi:
5449 return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
5450 Op.getOperand(1));
5451 case Intrinsic::aarch64_sve_sunpklo:
5452 return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
5453 Op.getOperand(1));
5454 case Intrinsic::aarch64_sve_uunpkhi:
5455 return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
5456 Op.getOperand(1));
5457 case Intrinsic::aarch64_sve_uunpklo:
5458 return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
5459 Op.getOperand(1));
5460 case Intrinsic::aarch64_sve_clasta_n:
5461 return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
5462 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5463 case Intrinsic::aarch64_sve_clastb_n:
5464 return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
5465 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5466 case Intrinsic::aarch64_sve_lasta:
5467 return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
5468 Op.getOperand(1), Op.getOperand(2));
5469 case Intrinsic::aarch64_sve_lastb:
5470 return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
5471 Op.getOperand(1), Op.getOperand(2));
5472 case Intrinsic::aarch64_sve_rev:
5473 return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
5474 Op.getOperand(1));
5475 case Intrinsic::aarch64_sve_tbl:
5476 return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
5477 Op.getOperand(1), Op.getOperand(2));
5478 case Intrinsic::aarch64_sve_trn1:
5479 return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
5480 Op.getOperand(1), Op.getOperand(2));
5481 case Intrinsic::aarch64_sve_trn2:
5482 return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
5483 Op.getOperand(1), Op.getOperand(2));
5484 case Intrinsic::aarch64_sve_uzp1:
5485 return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
5486 Op.getOperand(1), Op.getOperand(2));
5487 case Intrinsic::aarch64_sve_uzp2:
5488 return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
5489 Op.getOperand(1), Op.getOperand(2));
5490 case Intrinsic::aarch64_sve_zip1:
5491 return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
5492 Op.getOperand(1), Op.getOperand(2));
5493 case Intrinsic::aarch64_sve_zip2:
5494 return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
5495 Op.getOperand(1), Op.getOperand(2));
5496 case Intrinsic::aarch64_sve_splice:
5497 return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
5498 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5499 case Intrinsic::aarch64_sve_ptrue:
5500 return getPTrue(DAG, dl, Op.getValueType(), Op.getConstantOperandVal(1));
5501 case Intrinsic::aarch64_sve_clz:
5502 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
5503 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5504 case Intrinsic::aarch64_sme_cntsb:
5505 return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5506 DAG.getConstant(1, dl, MVT::i32));
5507 case Intrinsic::aarch64_sme_cntsh: {
5508 SDValue One = DAG.getConstant(1, dl, MVT::i32);
5509 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One);
5510 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One);
5511 }
5512 case Intrinsic::aarch64_sme_cntsw: {
5513 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5514 DAG.getConstant(1, dl, MVT::i32));
5515 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
5516 DAG.getConstant(2, dl, MVT::i32));
5517 }
5518 case Intrinsic::aarch64_sme_cntsd: {
5519 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5520 DAG.getConstant(1, dl, MVT::i32));
5521 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
5522 DAG.getConstant(3, dl, MVT::i32));
5523 }
5524 case Intrinsic::aarch64_sve_cnt: {
5525 SDValue Data = Op.getOperand(3);
5526 // CTPOP only supports integer operands.
5527 if (Data.getValueType().isFloatingPoint())
5528 Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
5529 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
5530 Op.getOperand(2), Data, Op.getOperand(1));
5531 }
5532 case Intrinsic::aarch64_sve_dupq_lane:
5533 return LowerDUPQLane(Op, DAG);
5534 case Intrinsic::aarch64_sve_convert_from_svbool:
5535 if (Op.getValueType() == MVT::aarch64svcount)
5536 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Op.getOperand(1));
5537 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
5538 case Intrinsic::aarch64_sve_convert_to_svbool:
5539 if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
5540 return DAG.getNode(ISD::BITCAST, dl, MVT::nxv16i1, Op.getOperand(1));
5541 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
5542 case Intrinsic::aarch64_sve_fneg:
5543 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
5544 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5545 case Intrinsic::aarch64_sve_frintp:
5546 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
5547 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5548 case Intrinsic::aarch64_sve_frintm:
5549 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
5550 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5551 case Intrinsic::aarch64_sve_frinti:
5552 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(),
5553 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5554 case Intrinsic::aarch64_sve_frintx:
5555 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
5556 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5557 case Intrinsic::aarch64_sve_frinta:
5558 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
5559 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5560 case Intrinsic::aarch64_sve_frintn:
5561 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(),
5562 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5563 case Intrinsic::aarch64_sve_frintz:
5564 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
5565 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5566 case Intrinsic::aarch64_sve_ucvtf:
5567 return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl,
5568 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5569 Op.getOperand(1));
5570 case Intrinsic::aarch64_sve_scvtf:
5571 return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl,
5572 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5573 Op.getOperand(1));
5574 case Intrinsic::aarch64_sve_fcvtzu:
5575 return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl,
5576 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5577 Op.getOperand(1));
5578 case Intrinsic::aarch64_sve_fcvtzs:
5579 return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl,
5580 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5581 Op.getOperand(1));
5582 case Intrinsic::aarch64_sve_fsqrt:
5583 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
5584 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5585 case Intrinsic::aarch64_sve_frecpx:
5586 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
5587 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5588 case Intrinsic::aarch64_sve_frecpe_x:
5589 return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(),
5590 Op.getOperand(1));
5591 case Intrinsic::aarch64_sve_frecps_x:
5592 return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(),
5593 Op.getOperand(1), Op.getOperand(2));
5594 case Intrinsic::aarch64_sve_frsqrte_x:
5595 return DAG.getNode(AArch64ISD::FRSQRTE, dl, Op.getValueType(),
5596 Op.getOperand(1));
5597 case Intrinsic::aarch64_sve_frsqrts_x:
5598 return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(),
5599 Op.getOperand(1), Op.getOperand(2));
5600 case Intrinsic::aarch64_sve_fabs:
5601 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
5602 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5603 case Intrinsic::aarch64_sve_abs:
5604 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
5605 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5606 case Intrinsic::aarch64_sve_neg:
5607 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
5608 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5609 case Intrinsic::aarch64_sve_insr: {
5610 SDValue Scalar = Op.getOperand(2);
5611 EVT ScalarTy = Scalar.getValueType();
5612 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
5613 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
5614
5615 return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
5616 Op.getOperand(1), Scalar);
5617 }
5618 case Intrinsic::aarch64_sve_rbit:
5619 return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl,
5620 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5621 Op.getOperand(1));
5622 case Intrinsic::aarch64_sve_revb:
5623 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
5624 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5625 case Intrinsic::aarch64_sve_revh:
5626 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(),
5627 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5628 case Intrinsic::aarch64_sve_revw:
5629 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(),
5630 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5631 case Intrinsic::aarch64_sve_revd:
5632 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(),
5633 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5634 case Intrinsic::aarch64_sve_sxtb:
5635 return DAG.getNode(
5636 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5637 Op.getOperand(2), Op.getOperand(3),
5638 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
5639 Op.getOperand(1));
5640 case Intrinsic::aarch64_sve_sxth:
5641 return DAG.getNode(
5642 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5643 Op.getOperand(2), Op.getOperand(3),
5644 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
5645 Op.getOperand(1));
5646 case Intrinsic::aarch64_sve_sxtw:
5647 return DAG.getNode(
5648 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5649 Op.getOperand(2), Op.getOperand(3),
5650 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
5651 Op.getOperand(1));
5652 case Intrinsic::aarch64_sve_uxtb:
5653 return DAG.getNode(
5654 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5655 Op.getOperand(2), Op.getOperand(3),
5656 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
5657 Op.getOperand(1));
5658 case Intrinsic::aarch64_sve_uxth:
5659 return DAG.getNode(
5660 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5661 Op.getOperand(2), Op.getOperand(3),
5662 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
5663 Op.getOperand(1));
5664 case Intrinsic::aarch64_sve_uxtw:
5665 return DAG.getNode(
5666 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5667 Op.getOperand(2), Op.getOperand(3),
5668 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
5669 Op.getOperand(1));
5670 case Intrinsic::localaddress: {
5671 const auto &MF = DAG.getMachineFunction();
5672 const auto *RegInfo = Subtarget->getRegisterInfo();
5673 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
5674 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
5675 Op.getSimpleValueType());
5676 }
5677
5678 case Intrinsic::eh_recoverfp: {
5679 // FIXME: This needs to be implemented to correctly handle highly aligned
5680 // stack objects. For now we simply return the incoming FP. Refer D53541
5681 // for more details.
5682 SDValue FnOp = Op.getOperand(1);
5683 SDValue IncomingFPOp = Op.getOperand(2);
5684 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
5685 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
5686 if (!Fn)
5688 "llvm.eh.recoverfp must take a function as the first argument");
5689 return IncomingFPOp;
5690 }
5691
5692 case Intrinsic::aarch64_neon_vsri:
5693 case Intrinsic::aarch64_neon_vsli:
5694 case Intrinsic::aarch64_sve_sri:
5695 case Intrinsic::aarch64_sve_sli: {
5696 EVT Ty = Op.getValueType();
5697
5698 if (!Ty.isVector())
5699 report_fatal_error("Unexpected type for aarch64_neon_vsli");
5700
5701 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
5702
5703 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
5704 IntNo == Intrinsic::aarch64_sve_sri;
5705 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
5706 return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
5707 Op.getOperand(3));
5708 }
5709
5710 case Intrinsic::aarch64_neon_srhadd:
5711 case Intrinsic::aarch64_neon_urhadd:
5712 case Intrinsic::aarch64_neon_shadd:
5713 case Intrinsic::aarch64_neon_uhadd: {
5714 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
5715 IntNo == Intrinsic::aarch64_neon_shadd);
5716 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
5717 IntNo == Intrinsic::aarch64_neon_urhadd);
5718 unsigned Opcode = IsSignedAdd
5719 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
5720 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
5721 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
5722 Op.getOperand(2));
5723 }
5724 case Intrinsic::aarch64_neon_saddlp:
5725 case Intrinsic::aarch64_neon_uaddlp: {
5726 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
5727 ? AArch64ISD::UADDLP
5728 : AArch64ISD::SADDLP;
5729 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
5730 }
5731 case Intrinsic::aarch64_neon_sdot:
5732 case Intrinsic::aarch64_neon_udot:
5733 case Intrinsic::aarch64_sve_sdot:
5734 case Intrinsic::aarch64_sve_udot: {
5735 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
5736 IntNo == Intrinsic::aarch64_sve_udot)
5737 ? AArch64ISD::UDOT
5738 : AArch64ISD::SDOT;
5739 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
5740 Op.getOperand(2), Op.getOperand(3));
5741 }
5742 case Intrinsic::get_active_lane_mask: {
5743 SDValue ID =
5744 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
5745 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), ID,
5746 Op.getOperand(1), Op.getOperand(2));
5747 }
5748 case Intrinsic::aarch64_neon_uaddlv: {
5749 EVT OpVT = Op.getOperand(1).getValueType();
5750 EVT ResVT = Op.getValueType();
5751 if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
5752 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) {
5753 // In order to avoid an insert_subvector, use v4i32 rather than v2i32.
5754 SDValue UADDLV =
5755 DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1));
5756 SDValue EXTRACT_VEC_ELT =
5757 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, UADDLV,
5758 DAG.getConstant(0, dl, MVT::i64));
5759 return EXTRACT_VEC_ELT;
5760 }
5761 return SDValue();
5762 }
5763 case Intrinsic::experimental_cttz_elts: {
5764 SDValue NewCttzElts =
5765 DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, Op.getOperand(1));
5766
5767 return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
5768 }
5769 }
5770}
5771
5772bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
5773 if (VT.getVectorElementType() == MVT::i8 ||
5774 VT.getVectorElementType() == MVT::i16) {
5775 EltTy = MVT::i32;
5776 return true;
5777 }
5778 return false;
5779}
5780
5781bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
5782 EVT DataVT) const {
5783 const EVT IndexVT = Extend.getOperand(0).getValueType();
5784 // SVE only supports implicit extension of 32-bit indices.
5785 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
5786 return false;
5787
5788 // Indices cannot be smaller than the main data type.
5789 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
5790 return false;
5791
5792 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
5793 // element container type, which would violate the previous clause.
5794 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
5795}
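// Example: a zero-extended <vscale x 4 x i32> index feeding a gather/scatter of
// <vscale x 4 x i32> data can drop the extend, since SVE's unsigned 32-bit
// offset addressing mode performs it implicitly; <vscale x 2 x i32> data is
// rejected because those elements are legalised into 64-bit containers.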
5796
5797bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
5798 EVT ExtVT = ExtVal.getValueType();
5799 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
5800 return false;
5801
5802 // It may be worth creating extending masked loads if there are multiple
5803 // masked loads using the same predicate. That way we'll end up creating
5804 // extending masked loads that may then get split by the legaliser. This
5805 // results in just one set of predicate unpacks at the start, instead of
5806 // multiple sets of vector unpacks after each load.
5807 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
5808 if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) {
5809 // Disable extending masked loads for fixed-width for now, since the code
5810 // quality doesn't look great.
5811 if (!ExtVT.isScalableVector())
5812 return false;
5813
5814 unsigned NumExtMaskedLoads = 0;
5815 for (auto *U : Ld->getMask()->uses())
5816 if (isa<MaskedLoadSDNode>(U))
5817 NumExtMaskedLoads++;
5818
5819 if (NumExtMaskedLoads <= 1)
5820 return false;
5821 }
5822 }
5823
5824 return true;
5825}
5826
5827unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
5828 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
5829 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
5831 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
5833 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
5835 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
5837 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
5839 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
5841 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
5843 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
5845 };
5846 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
5847 return AddrModes.find(Key)->second;
5848}
5849
5850unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
5851 switch (Opcode) {
5852 default:
5853 llvm_unreachable("unimplemented opcode");
5854 return Opcode;
5869 }
5870}
5871
5872SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
5873 SelectionDAG &DAG) const {
5874 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
5875
5876 SDLoc DL(Op);
5877 SDValue Chain = MGT->getChain();
5878 SDValue PassThru = MGT->getPassThru();
5879 SDValue Mask = MGT->getMask();
5880 SDValue BasePtr = MGT->getBasePtr();
5881 SDValue Index = MGT->getIndex();
5882 SDValue Scale = MGT->getScale();
5883 EVT VT = Op.getValueType();
5884 EVT MemVT = MGT->getMemoryVT();
5885 ISD::LoadExtType ExtType = MGT->getExtensionType();
5886 ISD::MemIndexType IndexType = MGT->getIndexType();
5887
5888 // SVE supports zero (and so undef) passthrough values only; everything else
5889 // must be handled manually by an explicit select on the load's output.
5890 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
5891 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
5892 SDValue Load =
5893 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
5894 MGT->getMemOperand(), IndexType, ExtType);
5895 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
5896 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
5897 }
5898
5899 bool IsScaled = MGT->isIndexScaled();
5900 bool IsSigned = MGT->isIndexSigned();
5901
5902 // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
5903 // must be calculated beforehand.
5904 uint64_t ScaleVal = Scale->getAsZExtVal();
5905 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
5906 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
5907 EVT IndexVT = Index.getValueType();
5908 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
5909 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
5910 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
5911
5912 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
5913 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
5914 MGT->getMemOperand(), IndexType, ExtType);
5915 }
5916
5917 // Lower fixed length gather to a scalable equivalent.
5918 if (VT.isFixedLengthVector()) {
5919 assert(Subtarget->useSVEForFixedLengthVectors() &&
5920 "Cannot lower when not using SVE for fixed vectors!");
5921
5922 // NOTE: Handle floating-point as if integer then bitcast the result.
5924 MemVT = MemVT.changeVectorElementTypeToInteger();
5925
5926 // Find the smallest integer fixed length vector we can use for the gather.
5927 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
5928 if (DataVT.getVectorElementType() == MVT::i64 ||
5929 Index.getValueType().getVectorElementType() == MVT::i64 ||
5930 Mask.getValueType().getVectorElementType() == MVT::i64)
5931 PromotedVT = VT.changeVectorElementType(MVT::i64);
5932
5933 // Promote vector operands except for passthrough, which we know is either
5934 // undef or zero, and thus best constructed directly.
5935 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5936 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
5937 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
5938
5939 // A promoted result type forces the need for an extending load.
5940 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
5941 ExtType = ISD::EXTLOAD;
5942
5943 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
5944
5945 // Convert fixed length vector operands to scalable.
5946 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
5947 Index = convertToScalableVector(DAG, ContainerVT, Index);
5949 PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
5950 : DAG.getConstant(0, DL, ContainerVT);
5951
5952 // Emit equivalent scalable vector gather.
5953 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
5954 SDValue Load =
5955 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
5956 Ops, MGT->getMemOperand(), IndexType, ExtType);
5957
5958 // Extract fixed length data then convert to the required result type.
5959 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
5960 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
5961 if (VT.isFloatingPoint())
5962 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
5963
5964 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
5965 }
5966
5967 // Everything else is legal.
5968 return Op;
5969}
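// Editor's sketch (not upstream code): the index pre-scaling used by
// LowerMGATHER and LowerMSCATTER, written out as plain integer arithmetic.
// When the requested scale does not match the memory element's store size,
// the byte offset Index * ScaleVal is folded into the index by a left shift
// and the operation is re-emitted with an effective scale of 1.
static uint64_t prescaledByteOffset(uint64_t Index, uint64_t ScaleVal) {
  assert(isPowerOf2_64(ScaleVal) && "Scale is expected to be a power of two");
  uint64_t ShiftedIndex = Index << Log2_64(ScaleVal);
  return ShiftedIndex * 1; // Same byte offset as Index * ScaleVal.
}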
5970
5971SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
5972 SelectionDAG &DAG) const {
5973 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
5974
5975 SDLoc DL(Op);
5976 SDValue Chain = MSC->getChain();
5977 SDValue StoreVal = MSC->getValue();
5978 SDValue Mask = MSC->getMask();
5979 SDValue BasePtr = MSC->getBasePtr();
5980 SDValue Index = MSC->getIndex();
5981 SDValue Scale = MSC->getScale();
5982 EVT VT = StoreVal.getValueType();
5983 EVT MemVT = MSC->getMemoryVT();
5984 ISD::MemIndexType IndexType = MSC->getIndexType();
5985 bool Truncating = MSC->isTruncatingStore();
5986
5987 bool IsScaled = MSC->isIndexScaled();
5988 bool IsSigned = MSC->isIndexSigned();
5989
5990 // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
5991 // must be calculated beforehand.
5992 uint64_t ScaleVal = Scale->getAsZExtVal();
5993 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
5994 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
5995 EVT IndexVT = Index.getValueType();
5996 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
5997 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
5998 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
5999
6000 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6001 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6002 MSC->getMemOperand(), IndexType, Truncating);
6003 }
6004
6005 // Lower fixed length scatter to a scalable equivalent.
6006 if (VT.isFixedLengthVector()) {
6007 assert(Subtarget->useSVEForFixedLengthVectors() &&
6008 "Cannot lower when not using SVE for fixed vectors!");
6009
6010 // Once bitcast we treat floating-point scatters as if integer.
6011 if (VT.isFloatingPoint()) {
6013 MemVT = MemVT.changeVectorElementTypeToInteger();
6014 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
6015 }
6016
6017 // Find the smallest integer fixed length vector we can use for the scatter.
6018 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6019 if (VT.getVectorElementType() == MVT::i64 ||
6020 Index.getValueType().getVectorElementType() == MVT::i64 ||
6021 Mask.getValueType().getVectorElementType() == MVT::i64)
6022 PromotedVT = VT.changeVectorElementType(MVT::i64);
6023
6024 // Promote vector operands.
6025 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6026 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6027 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6028 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
6029
6030 // A promoted value type forces the need for a truncating store.
6031 if (PromotedVT != VT)
6032 Truncating = true;
6033
6034 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6035
6036 // Convert fixed length vector operands to scalable.
6037 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6038 Index = convertToScalableVector(DAG, ContainerVT, Index);
6040 StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
6041
6042 // Emit equivalent scalable vector scatter.
6043 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6044 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6045 MSC->getMemOperand(), IndexType, Truncating);
6046 }
6047
6048 // Everything else is legal.
6049 return Op;
6050}
6051
6052SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
6053 SDLoc DL(Op);
6054 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
6055 assert(LoadNode && "Expected custom lowering of a masked load node");
6056 EVT VT = Op->getValueType(0);
6057
6058 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
6059 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
6060
6061 SDValue PassThru = LoadNode->getPassThru();
6062 SDValue Mask = LoadNode->getMask();
6063
6064 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
6065 return Op;
6066
6068 VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
6069 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
6070 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
6071 LoadNode->getExtensionType());
6072
6073 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6074
6075 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6076}
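// Editor's sketch (not upstream code): the elementwise semantics that the
// select-based lowering in LowerMLOAD relies on. A masked load with an
// arbitrary passthrough is equivalent to a masked load with an undef/zero
// passthrough followed by an explicit select against the original passthrough.
static void maskedLoadWithSelect(const int *Mem, const bool *Mask,
                                 const int *PassThru, int *Result, int N) {
  for (int I = 0; I < N; ++I) {
    int Loaded = Mask[I] ? Mem[I] : 0;          // Load with a zero passthrough.
    Result[I] = Mask[I] ? Loaded : PassThru[I]; // Explicit select.
  }
}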
6077
6078// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
6080 EVT VT, EVT MemVT,
6081 SelectionDAG &DAG) {
6082 assert(VT.isVector() && "VT should be a vector type");
6083 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
6084
6085 SDValue Value = ST->getValue();
6086
6087 // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and
6088 // extracts the word lane that represents the v4i8 subvector. This optimizes
6089 // the store to:
6090 //
6091 // xtn v0.8b, v0.8h
6092 // str s0, [x0]
6093
6094 SDValue Undef = DAG.getUNDEF(MVT::i16);
6095 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
6096 {Undef, Undef, Undef, Undef});
6097
6098 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
6099 Value, UndefVec);
6100 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
6101
6102 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
6103 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
6104 Trunc, DAG.getConstant(0, DL, MVT::i64));
6105
6106 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
6107 ST->getBasePtr(), ST->getMemOperand());
6108}
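// Editor's sketch (not upstream code, little-endian lane order assumed): a
// scalar model of the v4i16 -> v4i8 truncating store above. The four 16-bit
// lanes are narrowed to bytes and written out as a single 32-bit word,
// matching the "xtn v0.8b, v0.8h; str s0, [x0]" sequence.
static void truncStoreV4i16ToV4i8(const uint16_t Val[4], uint8_t *Dst) {
  uint32_t Word = 0;
  for (int I = 0; I < 4; ++I)
    Word |= uint32_t(uint8_t(Val[I])) << (8 * I); // Narrow each lane (xtn).
  for (int I = 0; I < 4; ++I)
    Dst[I] = uint8_t(Word >> (8 * I)); // One 32-bit store's worth (str s0).
}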
6109
6110 // Custom lowering for any store, vector or scalar, default or truncating.
6111 // Currently we only custom lower truncating stores from vector v4i16 to v4i8
6112 // and volatile stores of i128.
6113SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
6114 SelectionDAG &DAG) const {
6115 SDLoc Dl(Op);
6116 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
6117 assert(StoreNode && "Can only custom lower store nodes");
6118
6119 SDValue Value = StoreNode->getValue();
6120
6121 EVT VT = Value.getValueType();
6122 EVT MemVT = StoreNode->getMemoryVT();
6123
6124 if (VT.isVector()) {
6126 VT,
6127 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
6128 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
6129
6130 unsigned AS = StoreNode->getAddressSpace();
6131 Align Alignment = StoreNode->getAlign();
6132 if (Alignment < MemVT.getStoreSize() &&
6133 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
6134 StoreNode->getMemOperand()->getFlags(),
6135 nullptr)) {
6136 return scalarizeVectorStore(StoreNode, DAG);
6137 }
6138
6139 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
6140 MemVT == MVT::v4i8) {
6141 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
6142 }
6143 // 256-bit non-temporal stores can be lowered to STNP. Do this as part of
6144 // the custom lowering, as there are no unpaired non-temporal stores and
6145 // legalization will break up 256-bit inputs.
6147 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
6148 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
6149 (MemVT.getScalarSizeInBits() == 8u ||
6150 MemVT.getScalarSizeInBits() == 16u ||
6151 MemVT.getScalarSizeInBits() == 32u ||
6152 MemVT.getScalarSizeInBits() == 64u)) {
6153 SDValue Lo =
6156 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
6157 SDValue Hi =
6160 StoreNode->getValue(),
6161 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
6163 AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
6164 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
6165 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6166 return Result;
6167 }
6168 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
6169 return LowerStore128(Op, DAG);
6170 } else if (MemVT == MVT::i64x8) {
6171 SDValue Value = StoreNode->getValue();
6172 assert(Value->getValueType(0) == MVT::i64x8);
6173 SDValue Chain = StoreNode->getChain();
6174 SDValue Base = StoreNode->getBasePtr();
6175 EVT PtrVT = Base.getValueType();
6176 for (unsigned i = 0; i < 8; i++) {
6177 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
6178 Value, DAG.getConstant(i, Dl, MVT::i32));
6179 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
6180 DAG.getConstant(i * 8, Dl, PtrVT));
6181 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
6182 StoreNode->getOriginalAlign());
6183 }
6184 return Chain;
6185 }
6186
6187 return SDValue();
6188}
6189
6190/// Lower atomic or volatile 128-bit stores to a single STP instruction.
6191SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
6192 SelectionDAG &DAG) const {
6193 MemSDNode *StoreNode = cast<MemSDNode>(Op);
6194 assert(StoreNode->getMemoryVT() == MVT::i128);
6195 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
6196
6197 bool IsStoreRelease =
6199 if (StoreNode->isAtomic())
6200 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
6201 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
6204
6205 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
6206 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
6207 ? StoreNode->getOperand(1)
6208 : StoreNode->getOperand(2);
6209 SDLoc DL(Op);
6210 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
6211 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
6212 if (DAG.getDataLayout().isBigEndian())
6213 std::swap(StoreValue.first, StoreValue.second);
6215 Opcode, DL, DAG.getVTList(MVT::Other),
6216 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
6217 StoreNode->getBasePtr()},
6218 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6219 return Result;
6220}
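// Editor's sketch (not upstream code; assumes a compiler providing __int128):
// how a 128-bit value is split into the two 64-bit halves consumed by the
// STP/STILP node built above, with the halves swapped on big-endian targets
// so the in-memory layout is preserved.
static std::pair<uint64_t, uint64_t> splitI128(unsigned __int128 V,
                                               bool IsBigEndian) {
  uint64_t Lo = uint64_t(V);
  uint64_t Hi = uint64_t(V >> 64);
  if (IsBigEndian)
    std::swap(Lo, Hi);
  return {Lo, Hi};
}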
6221
6222SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
6223 SelectionDAG &DAG) const {
6224 SDLoc DL(Op);
6225 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
6226 assert(LoadNode && "Expected custom lowering of a load node");
6227
6228 if (LoadNode->getMemoryVT() == MVT::i64x8) {
6230 SDValue Base = LoadNode->getBasePtr();
6231 SDValue Chain = LoadNode->getChain();
6232 EVT PtrVT = Base.getValueType();
6233 for (unsigned i = 0; i < 8; i++) {
6234 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
6235 DAG.getConstant(i * 8, DL, PtrVT));
6236 SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
6237 LoadNode->getPointerInfo(),
6238 LoadNode->getOriginalAlign());
6239 Ops.push_back(Part);
6240 Chain = SDValue(Part.getNode(), 1);
6241 }
6242 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
6243 return DAG.getMergeValues({Loaded, Chain}, DL);
6244 }
6245
6246 // Custom lowering for extending v4i8 vector loads.
6247 EVT VT = Op->getValueType(0);
6248 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
6249
6250 if (LoadNode->getMemoryVT() != MVT::v4i8)
6251 return SDValue();
6252
6253 unsigned ExtType;
6254 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
6255 ExtType = ISD::SIGN_EXTEND;
6256 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
6257 LoadNode->getExtensionType() == ISD::EXTLOAD)
6258 ExtType = ISD::ZERO_EXTEND;
6259 else
6260 return SDValue();
6261
6262 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
6263 LoadNode->getBasePtr(), MachinePointerInfo());
6264 SDValue Chain = Load.getValue(1);
6265 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
6266 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
6267 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
6268 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
6269 DAG.getConstant(0, DL, MVT::i64));
6270 if (VT == MVT::v4i32)
6271 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
6272 return DAG.getMergeValues({Ext, Chain}, DL);
6273}
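// Editor's sketch (not upstream code, little-endian lane order assumed): a
// scalar model of the extending v4i8 load above. The four packed bytes are
// fetched with a single 32-bit load and each byte is then sign- or
// zero-extended into a 16-bit lane.
static void extLoadV4i8(const uint8_t *Src, int16_t Dst[4], bool IsSigned) {
  uint32_t Word = 0;
  for (int I = 0; I < 4; ++I)
    Word |= uint32_t(Src[I]) << (8 * I); // One 32-bit load's worth of bytes.
  for (int I = 0; I < 4; ++I) {
    uint8_t Byte = uint8_t(Word >> (8 * I));
    Dst[I] = IsSigned ? int16_t(int8_t(Byte)) : int16_t(Byte);
  }
}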
6274
6275// Generate SUBS and CSEL for integer abs.
6276SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
6277 MVT VT = Op.getSimpleValueType();
6278
6279 if (VT.isVector())
6280 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
6281
6282 SDLoc DL(Op);
6283 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
6284 Op.getOperand(0));
6285 // Generate SUBS & CSEL.
6286 SDValue Cmp =
6287 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
6288 Op.getOperand(0), DAG.getConstant(0, DL, VT));
6289 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
6290 DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
6291 Cmp.getValue(1));
6292}
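// Editor's sketch (not upstream code; the INT64_MIN overflow case is ignored
// for clarity): the scalar pattern that the SUBS + CSEL sequence above
// implements. CSEL with the PL condition (N flag clear after comparing the
// value against zero) keeps the original value when it is non-negative and
// otherwise selects the negation.
static int64_t absViaSubsCsel(int64_t X) {
  int64_t Neg = 0 - X; // SUB from zero.
  bool Pl = X >= 0;    // NZCV "PL" after SUBS X, #0.
  return Pl ? X : Neg; // CSEL X, Neg, pl.
}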
6293
6295 SDValue Chain = Op.getOperand(0);
6296 SDValue Cond = Op.getOperand(1);
6297 SDValue Dest = Op.getOperand(2);
6298
6300 if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
6301 SDLoc dl(Op);
6302 SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32);
6303 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
6304 Cmp);
6305 }
6306
6307 return SDValue();
6308}
6309
6310 // Treat FSHR with constant shifts as a legal operation; otherwise it is
6311 // expanded. FSHL is converted to FSHR before deciding what to do with it.
6313 SDValue Shifts = Op.getOperand(2);
6314 // Check if the shift amount is a constant.
6315 // If the opcode is FSHL, convert it to FSHR.
6316 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
6317 SDLoc DL(Op);
6318 MVT VT = Op.getSimpleValueType();
6319
6320 if (Op.getOpcode() == ISD::FSHL) {
6321 unsigned int NewShiftNo =
6322 VT.getFixedSizeInBits() - ShiftNo->getZExtValue();
6323 return DAG.getNode(
6324 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
6325 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
6326 } else if (Op.getOpcode() == ISD::FSHR) {
6327 return Op;
6328 }
6329 }
6330
6331 return SDValue();
6332}
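// Editor's sketch (not upstream code): why a constant-shift FSHL can be
// rewritten as FSHR with the complementary amount, as done above. For a shift
// amount S with 0 < S < 32:
//   fshl(A, B, S) = (A << S) | (B >> (32 - S)) = fshr(A, B, 32 - S)
static uint32_t fshl32(uint32_t A, uint32_t B, unsigned S) {
  return (A << S) | (B >> (32 - S)); // Valid for 0 < S < 32.
}
static uint32_t fshr32(uint32_t A, uint32_t B, unsigned S) {
  return (A << (32 - S)) | (B >> S); // Valid for 0 < S < 32.
}
// For any 0 < S < 32, fshl32(A, B, S) == fshr32(A, B, 32 - S).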
6333
6335 SDValue X = Op.getOperand(0);
6336 EVT XScalarTy = X.getValueType();
6337 SDValue Exp = Op.getOperand(1);
6338
6339 SDLoc DL(Op);
6340 EVT XVT, ExpVT;
6341 switch (Op.getSimpleValueType().SimpleTy) {
6342 default:
6343 return SDValue();
6344 case MVT::bf16:
6345 case MVT::f16:
6346 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
6347 [[fallthrough]];
6348 case MVT::f32:
6349 XVT = MVT::nxv4f32;
6350 ExpVT = MVT::nxv4i32;
6351 break;
6352 case MVT::f64:
6353 XVT = MVT::nxv2f64;
6354 ExpVT = MVT::nxv2i64;
6355 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
6356 break;
6357 }
6358
6359 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
6360 SDValue VX =
6361 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
6362 SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
6363 DAG.getUNDEF(ExpVT), Exp, Zero);
6364 SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1),
6365 AArch64SVEPredPattern::all);
6366 SDValue FScale =
6368 DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64),
6369 VPg, VX, VExp);
6370 SDValue Final =
6371 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
6372 if (X.getValueType() != XScalarTy)
6373 Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
6374 DAG.getIntPtrConstant(1, SDLoc(Op)));
6375 return Final;
6376}
6377
6379 SelectionDAG &DAG) const {
6380 LLVM_DEBUG(dbgs() << "Custom lowering: ");
6381 LLVM_DEBUG(Op.dump());
6382
6383 switch (Op.getOpcode()) {
6384 default:
6385 llvm_unreachable("unimplemented operand");
6386 return SDValue();
6387 case ISD::BITCAST:
6388 return LowerBITCAST(Op, DAG);
6389 case ISD::GlobalAddress:
6390 return LowerGlobalAddress(Op, DAG);
6392 return LowerGlobalTLSAddress(Op, DAG);
6393 case ISD::SETCC:
6394 case ISD::STRICT_FSETCC:
6396 return LowerSETCC(Op, DAG);
6397 case ISD::SETCCCARRY:
6398 return LowerSETCCCARRY(Op, DAG);
6399 case ISD::BRCOND:
6400 return LowerBRCOND(Op, DAG);
6401 case ISD::BR_CC:
6402 return LowerBR_CC(Op, DAG);
6403 case ISD::SELECT:
6404 return LowerSELECT(Op, DAG);
6405 case ISD::SELECT_CC:
6406 return LowerSELECT_CC(Op, DAG);
6407 case ISD::JumpTable:
6408 return LowerJumpTable(Op, DAG);
6409 case ISD::BR_JT:
6410 return LowerBR_JT(Op, DAG);
6411 case ISD::ConstantPool:
6412 return LowerConstantPool(Op, DAG);
6413 case ISD::BlockAddress:
6414 return LowerBlockAddress(Op, DAG);
6415 case ISD::VASTART:
6416 return LowerVASTART(Op, DAG);
6417 case ISD::VACOPY:
6418 return LowerVACOPY(Op, DAG);
6419 case ISD::VAARG:
6420 return LowerVAARG(Op, DAG);
6421 case ISD::UADDO_CARRY:
6422 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
6423 case ISD::USUBO_CARRY:
6424 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
6425 case ISD::SADDO_CARRY:
6426 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
6427 case ISD::SSUBO_CARRY:
6428 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
6429 case ISD::SADDO:
6430 case ISD::UADDO:
6431 case ISD::SSUBO:
6432 case ISD::USUBO:
6433 case ISD::SMULO:
6434 case ISD::UMULO:
6435 return LowerXALUO(Op, DAG);
6436 case ISD::FADD:
6437 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
6438 case ISD::FSUB:
6439 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
6440 case ISD::FMUL:
6441 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
6442 case ISD::FMA:
6443 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
6444 case ISD::FDIV:
6445 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
6446 case ISD::FNEG:
6447 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
6448 case ISD::FCEIL:
6449 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
6450 case ISD::FFLOOR:
6451 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
6452 case ISD::FNEARBYINT:
6453 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
6454 case ISD::FRINT:
6455 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
6456 case ISD::FROUND:
6457 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
6458 case ISD::FROUNDEVEN:
6459 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
6460 case ISD::FTRUNC:
6461 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
6462 case ISD::FSQRT:
6463 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
6464 case ISD::FABS:
6465 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
6466 case ISD::FP_ROUND:
6468 return LowerFP_ROUND(Op, DAG);
6469 case ISD::FP_EXTEND:
6470 return LowerFP_EXTEND(Op, DAG);
6471 case ISD::FRAMEADDR:
6472 return LowerFRAMEADDR(Op, DAG);
6473 case ISD::SPONENTRY:
6474 return LowerSPONENTRY(Op, DAG);
6475 case ISD::RETURNADDR:
6476 return LowerRETURNADDR(Op, DAG);
6478 return LowerADDROFRETURNADDR(Op, DAG);
6480 return LowerCONCAT_VECTORS(Op, DAG);
6482 return LowerINSERT_VECTOR_ELT(Op, DAG);
6484 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
6485 case ISD::BUILD_VECTOR:
6486 return LowerBUILD_VECTOR(Op, DAG);
6488 return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
6490 return LowerVECTOR_SHUFFLE(Op, DAG);
6491 case ISD::SPLAT_VECTOR:
6492 return LowerSPLAT_VECTOR(Op, DAG);
6494 return LowerEXTRACT_SUBVECTOR(Op, DAG);
6496 return LowerINSERT_SUBVECTOR(Op, DAG);
6497 case ISD::SDIV:
6498 case ISD::UDIV:
6499 return LowerDIV(Op, DAG);
6500 case ISD::SMIN:
6501 case ISD::UMIN:
6502 case ISD::SMAX:
6503 case ISD::UMAX:
6504 return LowerMinMax(Op, DAG);
6505 case ISD::SRA:
6506 case ISD::SRL:
6507 case ISD::SHL:
6508 return LowerVectorSRA_SRL_SHL(Op, DAG);
6509 case ISD::SHL_PARTS:
6510 case ISD::SRL_PARTS:
6511 case ISD::SRA_PARTS:
6512 return LowerShiftParts(Op, DAG);
6513 case ISD::CTPOP:
6514 case ISD::PARITY:
6515 return LowerCTPOP_PARITY(Op, DAG);
6516 case ISD::FCOPYSIGN:
6517 return LowerFCOPYSIGN(Op, DAG);
6518 case ISD::OR:
6519 return LowerVectorOR(Op, DAG);
6520 case ISD::XOR:
6521 return LowerXOR(Op, DAG);
6522 case ISD::PREFETCH:
6523 return LowerPREFETCH(Op, DAG);
6524 case ISD::SINT_TO_FP:
6525 case ISD::UINT_TO_FP:
6528 return LowerINT_TO_FP(Op, DAG);
6529 case ISD::FP_TO_SINT:
6530 case ISD::FP_TO_UINT:
6533 return LowerFP_TO_INT(Op, DAG);
6536 return LowerFP_TO_INT_SAT(Op, DAG);
6537 case ISD::FSINCOS:
6538 return LowerFSINCOS(Op, DAG);
6539 case ISD::GET_ROUNDING:
6540 return LowerGET_ROUNDING(Op, DAG);
6541 case ISD::SET_ROUNDING:
6542 return LowerSET_ROUNDING(Op, DAG);
6543 case ISD::MUL:
6544 return LowerMUL(Op, DAG);
6545 case ISD::MULHS:
6546 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
6547 case ISD::MULHU:
6548 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
6550 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6552 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6554 return LowerINTRINSIC_VOID(Op, DAG);
6555 case ISD::ATOMIC_STORE:
6556 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
6557 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
6558 return LowerStore128(Op, DAG);
6559 }
6560 return SDValue();
6561 case ISD::STORE:
6562 return LowerSTORE(Op, DAG);
6563 case ISD::MSTORE:
6564 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
6565 case ISD::MGATHER:
6566 return LowerMGATHER(Op, DAG);
6567 case ISD::MSCATTER:
6568 return LowerMSCATTER(Op, DAG);
6570 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
6571 case ISD::VECREDUCE_ADD:
6572 case ISD::VECREDUCE_AND:
6573 case ISD::VECREDUCE_OR:
6574 case ISD::VECREDUCE_XOR:
6584 return LowerVECREDUCE(Op, DAG);
6586 return LowerATOMIC_LOAD_AND(Op, DAG);
6588 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6589 case ISD::VSCALE:
6590 return LowerVSCALE(Op, DAG);
6591 case ISD::ANY_EXTEND:
6592 case ISD::SIGN_EXTEND:
6593 case ISD::ZERO_EXTEND:
6594 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
6596 // Only custom lower when ExtraVT has a legal byte based element type.
6597 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
6598 EVT ExtraEltVT = ExtraVT.getVectorElementType();
6599 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
6600 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
6601 return SDValue();
6602
6603 return LowerToPredicatedOp(Op, DAG,
6605 }
6606 case ISD::TRUNCATE:
6607 return LowerTRUNCATE(Op, DAG);
6608 case ISD::MLOAD:
6609 return LowerMLOAD(Op, DAG);
6610 case ISD::LOAD:
6611 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
6612 !Subtarget->isNeonAvailable()))
6613 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
6614 return LowerLOAD(Op, DAG);
6615 case ISD::ADD:
6616 case ISD::AND:
6617 case ISD::SUB:
6618 return LowerToScalableOp(Op, DAG);
6619 case ISD::FMAXIMUM:
6620 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
6621 case ISD::FMAXNUM:
6622 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
6623 case ISD::FMINIMUM:
6624 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
6625 case ISD::FMINNUM:
6626 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
6627 case ISD::VSELECT:
6628 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
6629 case ISD::ABS:
6630 return LowerABS(Op, DAG);
6631 case ISD::ABDS:
6632 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
6633 case ISD::ABDU:
6634 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
6635 case ISD::AVGFLOORS:
6636 return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
6637 case ISD::AVGFLOORU:
6638 return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
6639 case ISD::AVGCEILS:
6640 return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
6641 case ISD::AVGCEILU:
6642 return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
6643 case ISD::BITREVERSE:
6644 return LowerBitreverse(Op, DAG);
6645 case ISD::BSWAP:
6646 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
6647 case ISD::CTLZ:
6648 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
6649 case ISD::CTTZ:
6650 return LowerCTTZ(Op, DAG);
6651 case ISD::VECTOR_SPLICE:
6652 return LowerVECTOR_SPLICE(Op, DAG);
6654 return LowerVECTOR_DEINTERLEAVE(Op, DAG);
6656 return LowerVECTOR_INTERLEAVE(Op, DAG);
6657 case ISD::LROUND:
6658 case ISD::LLROUND:
6659 case ISD::LRINT:
6660 case ISD::LLRINT: {
6661 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
6662 Op.getOperand(0).getValueType() == MVT::bf16) &&
6663 "Expected custom lowering of rounding operations only for f16");
6664 SDLoc DL(Op);
6665 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
6666 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
6667 }
6668 case ISD::STRICT_LROUND:
6670 case ISD::STRICT_LRINT:
6671 case ISD::STRICT_LLRINT: {
6672 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
6673 Op.getOperand(1).getValueType() == MVT::bf16) &&
6674 "Expected custom lowering of rounding operations only for f16");
6675 SDLoc DL(Op);
6676 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
6677 {Op.getOperand(0), Op.getOperand(1)});
6678 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
6679 {Ext.getValue(1), Ext.getValue(0)});
6680 }
6681 case ISD::WRITE_REGISTER: {
6682 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
6683 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
6684 SDLoc DL(Op);
6685
6686 SDValue Chain = Op.getOperand(0);
6687 SDValue SysRegName = Op.getOperand(1);
6688 std::pair<SDValue, SDValue> Pair =
6689 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
6690
6691 // chain = MSRR(chain, sysregname, lo, hi)
6692 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
6693 SysRegName, Pair.first, Pair.second);
6694
6695 return Result;
6696 }
6697 case ISD::FSHL:
6698 case ISD::FSHR:
6699 return LowerFunnelShift(Op, DAG);
6700 case ISD::FLDEXP:
6701 return LowerFLDEXP(Op, DAG);
6702 }
6703}
6704
6706 return !Subtarget->useSVEForFixedLengthVectors();
6707}
6708
6710 EVT VT, bool OverrideNEON) const {
6711 if (!VT.isFixedLengthVector() || !VT.isSimple())
6712 return false;
6713
6714 // Don't use SVE for vectors we cannot scalarize if required.
6715 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
6716 // Fixed length predicates should be promoted to i8.
6717 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
6718 case MVT::i1:
6719 default:
6720 return false;
6721 case MVT::i8:
6722 case MVT::i16:
6723 case MVT::i32:
6724 case MVT::i64:
6725 case MVT::f16:
6726 case MVT::f32:
6727 case MVT::f64:
6728 break;
6729 }
6730
6731 // NEON-sized vectors can be emulated using SVE instructions.
6732 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
6733 return Subtarget->hasSVEorSME();
6734
6735 // Ensure NEON MVTs only belong to a single register class.
6736 if (VT.getFixedSizeInBits() <= 128)
6737 return false;
6738
6739 // Ensure wider than NEON code generation is enabled.
6740 if (!Subtarget->useSVEForFixedLengthVectors())
6741 return false;
6742
6743 // Don't use SVE for types that don't fit.
6744 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
6745 return false;
6746
6747 // TODO: Perhaps an artificial restriction, but worth having whilst getting
6748 // the base fixed length SVE support in place.
6749 if (!VT.isPow2VectorType())
6750 return false;
6751
6752 return true;
6753}
6754
6755//===----------------------------------------------------------------------===//
6756// Calling Convention Implementation
6757//===----------------------------------------------------------------------===//
6758
6759static unsigned getIntrinsicID(const SDNode *N) {
6760 unsigned Opcode = N->getOpcode();
6761 switch (Opcode) {
6762 default:
6765 unsigned IID = N->getConstantOperandVal(0);
6766 if (IID < Intrinsic::num_intrinsics)
6767 return IID;
6769 }
6770 }
6771}
6772
6774 SDValue N1) const {
6775 if (!N0.hasOneUse())
6776 return false;
6777
6778 unsigned IID = getIntrinsicID(N1.getNode());
6779 // Avoid reassociating expressions that can be lowered to smlal/umlal.
6780 if (IID == Intrinsic::aarch64_neon_umull ||
6781 N1.getOpcode() == AArch64ISD::UMULL ||
6782 IID == Intrinsic::aarch64_neon_smull ||
6784 return N0.getOpcode() != ISD::ADD;
6785
6786 return true;
6787}
6788
6789/// Selects the correct CCAssignFn for a given CallingConvention value.
6791 bool IsVarArg) const {
6792 switch (CC) {
6793 default:
6794 report_fatal_error("Unsupported calling convention.");
6795 case CallingConv::GHC:
6796 return CC_AArch64_GHC;
6797 case CallingConv::C:
6798 case CallingConv::Fast:
6802 case CallingConv::Swift:
6804 case CallingConv::Tail:
6805 case CallingConv::GRAAL:
6806 if (Subtarget->isTargetWindows()) {
6807 if (IsVarArg) {
6808 if (Subtarget->isWindowsArm64EC())
6811 }
6812 return CC_AArch64_Win64PCS;
6813 }
6814 if (!Subtarget->isTargetDarwin())
6815 return CC_AArch64_AAPCS;
6816 if (!IsVarArg)
6817 return CC_AArch64_DarwinPCS;
6820 case CallingConv::Win64:
6821 if (IsVarArg) {
6822 if (Subtarget->isWindowsArm64EC())
6825 }
6826 return CC_AArch64_Win64PCS;
6828 if (Subtarget->isWindowsArm64EC())
6835 return CC_AArch64_AAPCS;
6840 }
6841}
6842
6843CCAssignFn *
6845 switch (CC) {
6846 default:
6847 return RetCC_AArch64_AAPCS;
6851 if (Subtarget->isWindowsArm64EC())
6853 return RetCC_AArch64_AAPCS;
6854 }
6855}
6856
6857
6858unsigned
6859AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL,
6860 SelectionDAG &DAG) const {
6862 MachineFrameInfo &MFI = MF.getFrameInfo();
6863
6864 // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case)
6865 SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
6866 DAG.getConstant(1, DL, MVT::i32));
6867 SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N);
6868 SDValue Ops[] = {Chain, NN, DAG.getConstant(1, DL, MVT::i64)};
6869 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other);
6870 SDValue Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, VTs, Ops);
6871 Chain = Buffer.getValue(1);
6872 MFI.CreateVariableSizedObject(Align(1), nullptr);
6873
6874 // Allocate an additional TPIDR2 object on the stack (16 bytes)
6875 unsigned TPIDR2Obj = MFI.CreateStackObject(16, Align(16), false);
6876
6877 // Store the buffer pointer to the TPIDR2 stack object.
6880 TPIDR2Obj,
6882 Chain = DAG.getStore(Chain, DL, Buffer, Ptr, MPI);
6883
6884 // Set the reserved bytes (10-15) to zero
6885 EVT PtrTy = Ptr.getValueType();
6886 SDValue ReservedPtr =
6887 DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(10, DL, PtrTy));
6888 Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i16), ReservedPtr,
6889 MPI);
6890 ReservedPtr =
6891 DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(12, DL, PtrTy));
6892 Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i32), ReservedPtr,
6893 MPI);
6894
6895 return TPIDR2Obj;
6896}
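// Editor's sketch (an assumption based on the AAPCS64 SME lazy-save scheme,
// not upstream code): the layout of the 16-byte TPIDR2 block initialised
// above. Offset 0 holds the pointer to the SVL.B * SVL.B save buffer, and the
// reserved bytes 10-15 are the ones zeroed via the i16 store at offset 10 and
// the i32 store at offset 12.
struct TPIDR2BlockSketch {
  uint64_t ZASaveBuffer;    // Bytes 0-7: pointer to the lazy-save buffer.
  uint16_t NumZASaveSlices; // Bytes 8-9.
  uint8_t Reserved[6];      // Bytes 10-15: must be zero.
};
static_assert(sizeof(TPIDR2BlockSketch) == 16, "TPIDR2 block is 16 bytes");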
6897
6898SDValue AArch64TargetLowering::LowerFormalArguments(
6899 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
6900 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
6901 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
6903 const Function &F = MF.getFunction();
6904 MachineFrameInfo &MFI = MF.getFrameInfo();
6905 bool IsWin64 = Subtarget->isCallingConvWin64(F.getCallingConv());
6906 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
6907 (isVarArg && Subtarget->isWindowsArm64EC());
6909
6911 GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
6913 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
6914 FuncInfo->setIsSVECC(true);
6915
6916 // Assign locations to all of the incoming arguments.
6918 DenseMap<unsigned, SDValue> CopiedRegs;
6919 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
6920
6921 // At this point, Ins[].VT may already be promoted to i32. To correctly
6922 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
6923 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
6924 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
6925 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
6926 // LocVT.
6927 unsigned NumArgs = Ins.size();
6928 Function::const_arg_iterator CurOrigArg = F.arg_begin();
6929 unsigned CurArgIdx = 0;
6930 for (unsigned i = 0; i != NumArgs; ++i) {
6931 MVT ValVT = Ins[i].VT;
6932 if (Ins[i].isOrigArg()) {
6933 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
6934 CurArgIdx = Ins[i].getOrigArgIndex();
6935
6936 // Get type of the original argument.
6937 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
6938 /*AllowUnknown*/ true);
6939 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
6940 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
6941 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
6942 ValVT = MVT::i8;
6943 else if (ActualMVT == MVT::i16)
6944 ValVT = MVT::i16;
6945 }
6946 bool UseVarArgCC = false;
6947 if (IsWin64)
6948 UseVarArgCC = isVarArg;
6949 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
6950 bool Res =
6951 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
6952 assert(!Res && "Call operand has unhandled type");
6953 (void)Res;
6954 }
6955
6957 bool IsLocallyStreaming =
6958 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
6959 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
6960 SDValue Glue = Chain.getValue(1);
6961
6962 SmallVector<SDValue, 16> ArgValues;
6963 unsigned ExtraArgLocs = 0;
6964 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
6965 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
6966
6967 if (Ins[i].Flags.isByVal()) {
6968 // Byval is used for HFAs in the PCS, but the system should work in a
6969 // non-compliant manner for larger structs.
6970 EVT PtrVT = getPointerTy(DAG.getDataLayout());
6971 int Size = Ins[i].Flags.getByValSize();
6972 unsigned NumRegs = (Size + 7) / 8;
6973
6974 // FIXME: This works on big-endian for composite byvals, which are the
6975 // common case. It should work for fundamental types too.
6976 unsigned FrameIdx =
6977 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
6978 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
6979 InVals.push_back(FrameIdxN);
6980
6981 continue;
6982 }
6983
6984 if (Ins[i].Flags.isSwiftAsync())
6986
6987 SDValue ArgValue;
6988 if (VA.isRegLoc()) {
6989 // Arguments stored in registers.
6990 EVT RegVT = VA.getLocVT();
6991 const TargetRegisterClass *RC;
6992
6993 if (RegVT == MVT::i32)
6994 RC = &AArch64::GPR32RegClass;
6995 else if (RegVT == MVT::i64)
6996 RC = &AArch64::GPR64RegClass;
6997 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
6998 RC = &AArch64::FPR16RegClass;
6999 else if (RegVT == MVT::f32)
7000 RC = &AArch64::FPR32RegClass;
7001 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
7002 RC = &AArch64::FPR64RegClass;
7003 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
7004 RC = &AArch64::FPR128RegClass;
7005 else if (RegVT.isScalableVector() &&
7006 RegVT.getVectorElementType() == MVT::i1) {
7007 FuncInfo->setIsSVECC(true);
7008 RC = &AArch64::PPRRegClass;
7009 } else if (RegVT == MVT::aarch64svcount) {
7010 FuncInfo->setIsSVECC(true);
7011 RC = &AArch64::PPRRegClass;
7012 } else if (RegVT.isScalableVector()) {
7013 FuncInfo->setIsSVECC(true);
7014 RC = &AArch64::ZPRRegClass;
7015 } else
7016 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
7017
7018 // Transform the arguments in physical registers into virtual ones.
7019 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
7020
7021 if (IsLocallyStreaming) {
7022 // LocallyStreamingFunctions must insert the SMSTART in the correct
7023 // position, so we use Glue to ensure no instructions can be scheduled
7024 // between the chain of:
7025 // t0: ch,glue = EntryNode
7026 // t1: res,ch,glue = CopyFromReg
7027 // ...
7028 // tn: res,ch,glue = CopyFromReg t(n-1), ..
7029 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
7030 // ^^^^^^
7031 // This will be the new Chain/Root node.
7032 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
7033 Glue = ArgValue.getValue(2);
7034 } else
7035 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
7036
7037 // If this is an 8, 16 or 32-bit value, it is really passed promoted
7038 // to 64 bits. Insert an assert[sz]ext to capture this, then
7039 // truncate to the right size.
7040 switch (VA.getLocInfo()) {
7041 default:
7042 llvm_unreachable("Unknown loc info!");
7043 case CCValAssign::Full:
7044 break;
7046 assert(
7047 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
7048 "Indirect arguments should be scalable on most subtargets");
7049 break;
7050 case CCValAssign::BCvt:
7051 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
7052 break;
7053 case CCValAssign::AExt:
7054 case CCValAssign::SExt:
7055 case CCValAssign::ZExt:
7056 break;
7058 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
7059 DAG.getConstant(32, DL, RegVT));
7060 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
7061 break;
7062 }
7063 } else { // VA.isRegLoc()
7064 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
7065 unsigned ArgOffset = VA.getLocMemOffset();
7066 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
7067 ? VA.getLocVT().getSizeInBits()
7068 : VA.getValVT().getSizeInBits()) / 8;
7069
7070 uint32_t BEAlign = 0;
7071 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
7072 !Ins[i].Flags.isInConsecutiveRegs())
7073 BEAlign = 8 - ArgSize;
7074
7075 SDValue FIN;
7076 MachinePointerInfo PtrInfo;
7077 if (StackViaX4) {
7078 // In both the ARM64EC varargs convention and the thunk convention,
7079 // arguments on the stack are accessed relative to x4, not sp. In
7080 // the thunk convention, there's an additional offset of 32 bytes
7081 // to account for the shadow store.
7082 unsigned ObjOffset = ArgOffset + BEAlign;
7083 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
7084 ObjOffset += 32;
7085 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
7086 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7087 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
7088 DAG.getConstant(ObjOffset, DL, MVT::i64));
7090 } else {
7091 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
7092
7093 // Create load nodes to retrieve arguments from the stack.
7094 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
7095 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
7096 }
7097
7098 // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
7100 MVT MemVT = VA.getValVT();
7101
7102 switch (VA.getLocInfo()) {
7103 default:
7104 break;
7105 case CCValAssign::Trunc:
7106 case CCValAssign::BCvt:
7107 MemVT = VA.getLocVT();
7108 break;
7111 Subtarget->isWindowsArm64EC()) &&
7112 "Indirect arguments should be scalable on most subtargets");
7113 MemVT = VA.getLocVT();
7114 break;
7115 case CCValAssign::SExt:
7116 ExtType = ISD::SEXTLOAD;
7117 break;
7118 case CCValAssign::ZExt:
7119 ExtType = ISD::ZEXTLOAD;
7120 break;
7121 case CCValAssign::AExt:
7122 ExtType = ISD::EXTLOAD;
7123 break;
7124 }
7125
7126 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
7127 MemVT);
7128 }
7129
7130 if (VA.getLocInfo() == CCValAssign::Indirect) {
7131 assert((VA.getValVT().isScalableVT() ||
7132 Subtarget->isWindowsArm64EC()) &&
7133 "Indirect arguments should be scalable on most subtargets");
7134
7135 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
7136 unsigned NumParts = 1;
7137 if (Ins[i].Flags.isInConsecutiveRegs()) {
7138 assert(!Ins[i].Flags.isInConsecutiveRegsLast());
7139 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
7140 ++NumParts;
7141 }
7142
7143 MVT PartLoad = VA.getValVT();
7144 SDValue Ptr = ArgValue;
7145
7146 // Ensure we generate all loads for each tuple part, whilst updating the
7147 // pointer after each load correctly using vscale.
7148 while (NumParts > 0) {
7149 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
7150 InVals.push_back(ArgValue);
7151 NumParts--;
7152 if (NumParts > 0) {
7153 SDValue BytesIncrement;
7154 if (PartLoad.isScalableVector()) {
7155 BytesIncrement = DAG.getVScale(
7156 DL, Ptr.getValueType(),
7157 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
7158 } else {
7159 BytesIncrement = DAG.getConstant(
7160 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
7161 Ptr.getValueType());
7162 }
7164 Flags.setNoUnsignedWrap(true);
7165 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7166 BytesIncrement, Flags);
7167 ExtraArgLocs++;
7168 i++;
7169 }
7170 }
7171 } else {
7172 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
7173 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
7174 ArgValue, DAG.getValueType(MVT::i32));
7175
7176 // i1 arguments are zero-extended to i8 by the caller. Emit a
7177 // hint to reflect this.
7178 if (Ins[i].isOrigArg()) {
7179 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
7180 if (OrigArg->getType()->isIntegerTy(1)) {
7181 if (!Ins[i].Flags.isZExt()) {
7182 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
7183 ArgValue.getValueType(), ArgValue);
7184 }
7185 }
7186 }
7187
7188 InVals.push_back(ArgValue);
7189 }
7190 }
7191 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
7192
7193 // Insert the SMSTART if this is a locally streaming function and
7194 // make sure it is Glued to the last CopyFromReg value.
7195 if (IsLocallyStreaming) {
7196 SDValue PStateSM;
7197 if (Attrs.hasStreamingCompatibleInterface()) {
7198 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
7201 FuncInfo->setPStateSMReg(Reg);
7202 Chain = DAG.getCopyToReg(Chain, DL, Reg, PStateSM);
7203 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
7205 } else
7206 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
7208
7209 // Ensure that the SMSTART happens after the CopyWithChain such that its
7210 // chain result is used.
7211 for (unsigned I=0; I<InVals.size(); ++I) {
7213 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
7214 Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
7215 InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
7216 InVals[I].getValueType());
7217 }
7218 }
7219
7220 // varargs
7221 if (isVarArg) {
7222 if (!Subtarget->isTargetDarwin() || IsWin64) {
7223 // The AAPCS variadic function ABI is identical to the non-variadic
7224 // one. As a result there may be more arguments in registers and we should
7225 // save them for future reference.
7226 // Win64 variadic functions also pass arguments in registers, but all float
7227 // arguments are passed in integer registers.
7228 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
7229 }
7230
7231 // This will point to the next argument passed via stack.
7232 unsigned VarArgsOffset = CCInfo.getStackSize();
7233 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
7234 VarArgsOffset = alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
7235 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
7236 FuncInfo->setVarArgsStackIndex(
7237 MFI.CreateFixedObject(4, VarArgsOffset, true));
7238
7239 if (MFI.hasMustTailInVarArgFunc()) {
7240 SmallVector<MVT, 2> RegParmTypes;
7241 RegParmTypes.push_back(MVT::i64);
7242 RegParmTypes.push_back(MVT::f128);
7243 // Compute the set of forwarded registers. The rest are scratch.
7245 FuncInfo->getForwardedMustTailRegParms();
7246 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
7248
7249 // Conservatively forward X8, since it might be used for aggregate return.
7250 if (!CCInfo.isAllocated(AArch64::X8)) {
7251 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
7252 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
7253 }
7254 }
7255 }
7256
7257 // On Windows, InReg pointers must be returned, so record the pointer in a
7258 // virtual register at the start of the function so it can be returned in the
7259 // epilogue.
7260 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
7261 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
7262 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
7263 Ins[I].Flags.isInReg()) &&
7264 Ins[I].Flags.isSRet()) {
7265 assert(!FuncInfo->getSRetReturnReg());
7266
7267 MVT PtrTy = getPointerTy(DAG.getDataLayout());
7268 Register Reg =
7270 FuncInfo->setSRetReturnReg(Reg);
7271
7272 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
7273 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
7274 break;
7275 }
7276 }
7277 }
7278
7279 unsigned StackArgSize = CCInfo.getStackSize();
7280 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
7281 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
7282 // This is a non-standard ABI so by fiat I say we're allowed to make full
7283 // use of the stack area to be popped, which must be aligned to 16 bytes in
7284 // any case:
7285 StackArgSize = alignTo(StackArgSize, 16);
7286
7287 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
7288 // a multiple of 16.
7289 FuncInfo->setArgumentStackToRestore(StackArgSize);
7290
7291 // This realignment carries over to the available bytes below. Our own
7292 // callers will guarantee the space is free by giving an aligned value to
7293 // CALLSEQ_START.
7294 }
7295 // Even if we're not expected to free up the space, it's useful to know how
7296 // much is there while considering tail calls (because we can reuse it).
7297 FuncInfo->setBytesInStackArgArea(StackArgSize);
7298
7299 if (Subtarget->hasCustomCallingConv())
7301
7302 // Conservatively assume the function requires the lazy-save mechanism.
7303 if (SMEAttrs(MF.getFunction()).hasZAState()) {
7304 unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG);
7305 FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj);
7306 }
7307
7308 return Chain;
7309}
7310
7311void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
7312 SelectionDAG &DAG,
7313 const SDLoc &DL,
7314 SDValue &Chain) const {
7316 MachineFrameInfo &MFI = MF.getFrameInfo();
7318 auto PtrVT = getPointerTy(DAG.getDataLayout());
7319 bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
7320
7322
7324 unsigned NumGPRArgRegs = GPRArgRegs.size();
7325 if (Subtarget->isWindowsArm64EC()) {
7326 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
7327 // functions.
7328 NumGPRArgRegs = 4;
7329 }
7330 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
7331
7332 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
7333 int GPRIdx = 0;
7334 if (GPRSaveSize != 0) {
7335 if (IsWin64) {
7336 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
7337 if (GPRSaveSize & 15)
7338 // The extra size here, if triggered, will always be 8.
7339 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
7340 } else
7341 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
7342
7343 SDValue FIN;
7344 if (Subtarget->isWindowsArm64EC()) {
7345 // With the Arm64EC ABI, we reserve the save area as usual, but we
7346 // compute its address relative to x4. For a normal AArch64->AArch64
7347 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
7348 // different address.
7349 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
7350 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7351 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
7352 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
7353 } else {
7354 FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
7355 }
7356
7357 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
7358 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
7359 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7360 SDValue Store =
7361 DAG.getStore(Val.getValue(1), DL, Val, FIN,
7363 MF, GPRIdx, (i - FirstVariadicGPR) * 8)
7364 : MachinePointerInfo::getStack(MF, i * 8));
7365 MemOps.push_back(Store);
7366 FIN =
7367 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
7368 }
7369 }
7370 FuncInfo->setVarArgsGPRIndex(GPRIdx);
7371 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
7372
7373 if (Subtarget->hasFPARMv8() && !IsWin64) {
7375 const unsigned NumFPRArgRegs = FPRArgRegs.size();
7376 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
7377
7378 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
7379 int FPRIdx = 0;
7380 if (FPRSaveSize != 0) {
7381 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
7382
7383 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
7384
7385 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
7386 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
7387 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
7388
7389 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
7390 MachinePointerInfo::getStack(MF, i * 16));
7391 MemOps.push_back(Store);
7392 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
7393 DAG.getConstant(16, DL, PtrVT));
7394 }
7395 }
7396 FuncInfo->setVarArgsFPRIndex(FPRIdx);
7397 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
7398 }
7399
7400 if (!MemOps.empty()) {
7401 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
7402 }
7403}
7404
7405static bool isPassedInFPR(EVT VT) {
7406 return VT.isFixedLengthVector() ||
7407 (VT.isFloatingPoint() && !VT.isScalableVector());
7408}
7409
7410/// LowerCallResult - Lower the result values of a call into the
7411/// appropriate copies out of appropriate physical registers.
7412SDValue AArch64TargetLowering::LowerCallResult(
7413 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
7414 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
7415 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
7416 SDValue ThisVal, bool RequiresSMChange) const {
7417 DenseMap<unsigned, SDValue> CopiedRegs;
7418 // Copy all of the result registers out of their specified physreg.
7419 for (unsigned i = 0; i != RVLocs.size(); ++i) {
7420 CCValAssign VA = RVLocs[i];
7421
7422 // Pass 'this' value directly from the argument to return value, to avoid
7423 // reg unit interference
7424 if (i == 0 && isThisReturn) {
7425 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
7426 "unexpected return calling convention register assignment");
7427 InVals.push_back(ThisVal);
7428 continue;
7429 }
7430
7431 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
7432 // allows one use of a physreg per block.
7433 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
7434 if (!Val) {
7435 Val =
7436 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
7437 Chain = Val.getValue(1);
7438 InGlue = Val.getValue(2);
7439 CopiedRegs[VA.getLocReg()] = Val;
7440 }
7441
7442 switch (VA.getLocInfo()) {
7443 default:
7444 llvm_unreachable("Unknown loc info!");
7445 case CCValAssign::Full:
7446 break;
7447 case CCValAssign::BCvt:
7448 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
7449 break;
7451 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
7452 DAG.getConstant(32, DL, VA.getLocVT()));
7453 [[fallthrough]];
7454 case CCValAssign::AExt:
7455 [[fallthrough]];
7456 case CCValAssign::ZExt:
7457 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
7458 break;
7459 }
7460
7461 if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
7463 Val);
7464
7465 InVals.push_back(Val);
7466 }
7467
7468 return Chain;
7469}
7470
7471/// Return true if the calling convention is one that we can guarantee TCO for.
7472static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
7473 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
7475}
7476
7477/// Return true if we might ever do TCO for calls with this calling convention.
7479 switch (CC) {
7480 case CallingConv::C:
7484 case CallingConv::Swift:
7486 case CallingConv::Tail:
7487 case CallingConv::Fast:
7488 return true;
7489 default:
7490 return false;
7491 }
7492}
7493
7495 const AArch64Subtarget *Subtarget,
7497 CCState &CCInfo) {
7498 const SelectionDAG &DAG = CLI.DAG;
7499 CallingConv::ID CalleeCC = CLI.CallConv;
7500 bool IsVarArg = CLI.IsVarArg;
7501 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7502 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
7503
7504 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
7505 // for the shadow store.
7506 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
7507 CCInfo.AllocateStack(32, Align(16));
7508
7509 unsigned NumArgs = Outs.size();
7510 for (unsigned i = 0; i != NumArgs; ++i) {
7511 MVT ArgVT = Outs[i].VT;
7512 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
7513
7514 bool UseVarArgCC = false;
7515 if (IsVarArg) {
7516 // On Windows, the fixed arguments in a vararg call are passed in GPRs
7517 // too, so use the vararg CC to force them to integer registers.
7518 if (IsCalleeWin64) {
7519 UseVarArgCC = true;
7520 } else {
7521 UseVarArgCC = !Outs[i].IsFixed;
7522 }
7523 }
7524
7525 if (!UseVarArgCC) {
7526 // Get type of the original argument.
7527 EVT ActualVT =
7528 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
7529 /*AllowUnknown*/ true);
7530 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
7531 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
7532 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
7533 ArgVT = MVT::i8;
7534 else if (ActualMVT == MVT::i16)
7535 ArgVT = MVT::i16;
7536 }
7537
7538 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
7539 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
7540 assert(!Res && "Call operand has unhandled type");
7541 (void)Res;
7542 }
7543}
7544
7545bool AArch64TargetLowering::isEligibleForTailCallOptimization(
7546 const CallLoweringInfo &CLI) const {
7547 CallingConv::ID CalleeCC = CLI.CallConv;
7548 if (!mayTailCallThisCC(CalleeCC))
7549 return false;
7550
7551 SDValue Callee = CLI.Callee;
7552 bool IsVarArg = CLI.IsVarArg;
7553 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7554 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
7555 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
7556 const SelectionDAG &DAG = CLI.DAG;
7558 const Function &CallerF = MF.getFunction();
7559 CallingConv::ID CallerCC = CallerF.getCallingConv();
7560
7561 // SME Streaming functions are not eligible for TCO as they may require
7562 // the streaming mode or ZA to be restored after returning from the call.
7563 SMEAttrs CallerAttrs(MF.getFunction());
7564 auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal);
7565 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
7566 CallerAttrs.requiresLazySave(CalleeAttrs) ||
7567 CallerAttrs.hasStreamingBody())
7568 return false;
7569
7570 // Functions using the C or Fast calling convention that have an SVE signature
7571 // preserve more registers and should assume the SVE_VectorCall CC.
7572 // The check for matching callee-saved regs will determine whether it is
7573 // eligible for TCO.
7574 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
7575 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
7576 CallerCC = CallingConv::AArch64_SVE_VectorCall;
7577
7578 bool CCMatch = CallerCC == CalleeCC;
7579
7580 // When using the Windows calling convention on a non-windows OS, we want
7581 // to back up and restore X18 in such functions; we can't do a tail call
7582 // from those functions.
7583 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
7584 CalleeCC != CallingConv::Win64)
7585 return false;
7586
7587 // Byval parameters hand the function a pointer directly into the stack area
7588 // we want to reuse during a tail call. Working around this *is* possible (see
7589 // X86) but less efficient and uglier in LowerCall.
7590 for (Function::const_arg_iterator i = CallerF.arg_begin(),
7591 e = CallerF.arg_end();
7592 i != e; ++i) {
7593 if (i->hasByValAttr())
7594 return false;
7595
7596 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
7597 // In this case, it is necessary to save/restore X0 in the callee. Tail
7598 // call opt interferes with this. So we disable tail call opt when the
7599 // caller has an argument with "inreg" attribute.
7600
7601 // FIXME: Check whether the callee also has an "inreg" argument.
7602 if (i->hasInRegAttr())
7603 return false;
7604 }
7605
7606 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
7607 return CCMatch;
7608
7609 // Externally-defined functions with weak linkage should not be
7610 // tail-called on AArch64 when the OS does not support dynamic
7611 // pre-emption of symbols, as the AAELF spec requires normal calls
7612 // to undefined weak functions to be replaced with a NOP or jump to the
7613 // next instruction. The behaviour of branch instructions in this
7614 // situation (as used for tail calls) is implementation-defined, so we
7615 // cannot rely on the linker replacing the tail call with a return.
7616 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
7617 const GlobalValue *GV = G->getGlobal();
7618 const Triple &TT = getTargetMachine().getTargetTriple();
7619 if (GV->hasExternalWeakLinkage() &&
7620 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
7621 return false;
7622 }
7623
7624 // Now we search for cases where we can use a tail call without changing the
7625 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
7626 // concept.
7627
7628 // I want anyone implementing a new calling convention to think long and hard
7629 // about this assert.
7630 assert((!IsVarArg || CalleeCC == CallingConv::C) &&
7631 "Unexpected variadic calling convention");
7632
7633 LLVMContext &C = *DAG.getContext();
7634 // Check that the call results are passed in the same way.
7635 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
7636 CCAssignFnForCall(CalleeCC, IsVarArg),
7637 CCAssignFnForCall(CallerCC, IsVarArg)))
7638 return false;
7639 // The callee has to preserve all registers the caller needs to preserve.
7640 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7641 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
7642 if (!CCMatch) {
7643 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
7644 if (Subtarget->hasCustomCallingConv()) {
7645 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
7646 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
7647 }
7648 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
7649 return false;
7650 }
7651
7652 // Nothing more to check if the callee is taking no arguments
7653 if (Outs.empty())
7654 return true;
7655
7656 SmallVector<CCValAssign, 16> ArgLocs;
7657 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
7658
7659 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
7660
7661 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
7662 // When the call is musttail, additional checks have already been done, so this check can safely be skipped.
7663 // At least two cases here: if caller is fastcc then we can't have any
7664 // memory arguments (we'd be expected to clean up the stack afterwards). If
7665 // caller is C then we could potentially use its argument area.
7666
7667 // FIXME: for now we take the most conservative of these in both cases:
7668 // disallow all variadic memory operands.
7669 for (const CCValAssign &ArgLoc : ArgLocs)
7670 if (!ArgLoc.isRegLoc())
7671 return false;
7672 }
7673
7674 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7675
7676 // If any of the arguments is passed indirectly, it must be SVE, so the
7677 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
7678 // allocate space on the stack. That is why we explicitly determine here that
7679 // the call cannot be a tail call.
7680 if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
7681 assert((A.getLocInfo() != CCValAssign::Indirect ||
7682 A.getValVT().isScalableVector() ||
7683 Subtarget->isWindowsArm64EC()) &&
7684 "Expected value to be scalable");
7685 return A.getLocInfo() == CCValAssign::Indirect;
7686 }))
7687 return false;
7688
7689 // If the stack arguments for this call do not fit into our own save area then
7690 // the call cannot be made tail.
7691 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
7692 return false;
7693
7694 const MachineRegisterInfo &MRI = MF.getRegInfo();
7695 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
7696 return false;
7697
7698 return true;
7699}
7700
7701SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
7702 SelectionDAG &DAG,
7703 MachineFrameInfo &MFI,
7704 int ClobberedFI) const {
7705 SmallVector<SDValue, 8> ArgChains;
7706 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
7707 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
7708
7709 // Include the original chain at the beginning of the list. When this is
7710 // used by target LowerCall hooks, this helps legalize find the
7711 // CALLSEQ_BEGIN node.
7712 ArgChains.push_back(Chain);
7713
7714 // Add a chain value for each stack-argument load that overlaps the clobbered range.
7715 for (SDNode *U : DAG.getEntryNode().getNode()->uses())
7716 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
7717 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
7718 if (FI->getIndex() < 0) {
7719 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
7720 int64_t InLastByte = InFirstByte;
7721 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
7722
7723 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
7724 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
7725 ArgChains.push_back(SDValue(L, 1));
7726 }
7727
7728 // Build a tokenfactor for all the chains.
7729 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
7730}
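// Annotation (not part of the original source): the two-clause range test
// above is the usual interval-overlap check; [FirstByte, LastByte] and
// [InFirstByte, InLastByte] intersect exactly when one interval's start lies
// inside the other. Every incoming-argument load that the upcoming store to
// ClobberedFI could clobber therefore has its chain folded into the
// TokenFactor returned here.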
7731
7732bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
7733 bool TailCallOpt) const {
7734 return (CallCC == CallingConv::Fast && TailCallOpt) ||
7735 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
7736}
7737
7738// Check if the value is zero-extended from i1 to i8
7739static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
7740 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
7741 if (SizeInBits < 8)
7742 return false;
7743
7744 APInt RequiredZero(SizeInBits, 0xFE);
7745 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
7746 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
7747 return ZExtBool;
7748}
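// Worked example (annotation, not part of the original source): for an i8
// argument that is provably 0 or 1, computeKnownBits() reports bits [7:1] as
// known zero, so (Bits.Zero & 0xFE) == 0xFE and checkZExtBool() returns true;
// the caller can then skip the explicit trunc-to-i1 / zext-to-i8 sequence in
// LowerCall below.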
7749
7750void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
7751 SDNode *Node) const {
7752 // Live-in physreg copies that are glued to SMSTART are applied as
7753 // implicit-def's in the InstrEmitter. Here we remove them, allowing the
7754 // register allocator to pass call args in callee saved regs, without extra
7755 // copies to avoid these fake clobbers of actually-preserved GPRs.
7756 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
7757 MI.getOpcode() == AArch64::MSRpstatePseudo) {
7758 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
7759 if (MachineOperand &MO = MI.getOperand(I);
7760 MO.isReg() && MO.isImplicit() && MO.isDef() &&
7761 (AArch64::GPR32RegClass.contains(MO.getReg()) ||
7762 AArch64::GPR64RegClass.contains(MO.getReg())))
7763 MI.removeOperand(I);
7764
7765 // The SVE vector length can change when entering/leaving streaming mode.
7766 if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
7767 MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
7768 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
7769 /*IsImplicit=*/true));
7770 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
7771 /*IsImplicit=*/true));
7772 }
7773 }
7774
7775 // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
7776 // have nothing to do with VG, were it not that they are used to materialise a
7777 // frame-address. If they contain a frame-index to a scalable vector, this
7778 // will likely require an ADDVL instruction to materialise the address, thus
7779 // reading VG.
7780 const MachineFunction &MF = *MI.getMF();
7781 if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
7782 (MI.getOpcode() == AArch64::ADDXri ||
7783 MI.getOpcode() == AArch64::SUBXri)) {
7784 const MachineOperand &MO = MI.getOperand(1);
7785 if (MO.isFI() && MF.getFrameInfo().getStackID(MO.getIndex()) ==
7786 TargetStackID::ScalableVector)
7787 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
7788 /*IsImplicit=*/true));
7789 }
7790}
7791
7792SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
7793 bool Enable, SDValue Chain,
7794 SDValue InGlue,
7795 unsigned Condition,
7796 SDValue PStateSM) const {
7797 MachineFunction &MF = DAG.getMachineFunction();
7798 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7799 FuncInfo->setHasStreamingModeChanges(true);
7800
7801 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7802 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
7803 SDValue MSROp =
7804 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
7805 SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
7806 SmallVector<SDValue> Ops = {Chain, MSROp, ConditionOp};
7807 if (Condition != AArch64SME::Always) {
7808 assert(PStateSM && "PStateSM should be defined");
7809 Ops.push_back(PStateSM);
7810 }
7811 Ops.push_back(RegMask);
7812
7813 if (InGlue)
7814 Ops.push_back(InGlue);
7815
7816 unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
7817 return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
7818}
7819
7820static unsigned getSMCondition(const SMEAttrs &CallerAttrs,
7821 const SMEAttrs &CalleeAttrs) {
7822 if (!CallerAttrs.hasStreamingCompatibleInterface() ||
7823 CallerAttrs.hasStreamingBody())
7824 return AArch64SME::Always;
7825 if (CalleeAttrs.hasNonStreamingInterface())
7826 return AArch64SME::IfCallerIsStreaming;
7827 if (CalleeAttrs.hasStreamingInterface())
7828 return AArch64SME::IfCallerIsNonStreaming;
7829
7830 llvm_unreachable("Unsupported attributes");
7831}
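// Annotation (not part of the original source): when the caller is
// streaming-compatible (and not locally streaming), the SMSTART/SMSTOP
// emitted around the call is made conditional; the PStateSM value computed in
// LowerCall is passed as an extra operand (see changeStreamingMode above) so
// the mode switch only happens if the caller's current mode differs from the
// one the callee requires.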
7832
7833/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
7834/// and add input and output parameter nodes.
7835SDValue
7836AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
7837 SmallVectorImpl<SDValue> &InVals) const {
7838 SelectionDAG &DAG = CLI.DAG;
7839 SDLoc &DL = CLI.DL;
7840 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7841 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
7842 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
7843 SDValue Chain = CLI.Chain;
7844 SDValue Callee = CLI.Callee;
7845 bool &IsTailCall = CLI.IsTailCall;
7846 CallingConv::ID &CallConv = CLI.CallConv;
7847 bool IsVarArg = CLI.IsVarArg;
7848
7849 MachineFunction &MF = DAG.getMachineFunction();
7850 MachineFunction::CallSiteInfo CSInfo;
7851 bool IsThisReturn = false;
7852
7853 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7854 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
7855 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
7856 bool IsSibCall = false;
7857 bool GuardWithBTI = false;
7858
7859 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
7860 !Subtarget->noBTIAtReturnTwice()) {
7861 GuardWithBTI = FuncInfo->branchTargetEnforcement();
7862 }
7863
7864 // Analyze operands of the call, assigning locations to each operand.
7865 SmallVector<CCValAssign, 16> ArgLocs;
7866 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
7867
7868 if (IsVarArg) {
7869 unsigned NumArgs = Outs.size();
7870
7871 for (unsigned i = 0; i != NumArgs; ++i) {
7872 if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
7873 report_fatal_error("Passing SVE types to variadic functions is "
7874 "currently not supported");
7875 }
7876 }
7877
7878 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
7879
7880 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
7881 // Assign locations to each value returned by this call.
7882 SmallVector<CCValAssign, 16> RVLocs;
7883 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
7884 *DAG.getContext());
7885 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
7886
7887 // Check callee args/returns for SVE registers and set calling convention
7888 // accordingly.
7889 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
7890 auto HasSVERegLoc = [](CCValAssign &Loc) {
7891 if (!Loc.isRegLoc())
7892 return false;
7893 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
7894 AArch64::PPRRegClass.contains(Loc.getLocReg());
7895 };
7896 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
7897 CallConv = CallingConv::AArch64_SVE_VectorCall;
7898 }
7899
7900 if (IsTailCall) {
7901 // Check if it's really possible to do a tail call.
7902 IsTailCall = isEligibleForTailCallOptimization(CLI);
7903
7904 // A sibling call is one where we're under the usual C ABI and not planning
7905 // to change that but can still do a tail call:
7906 if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
7907 CallConv != CallingConv::SwiftTail)
7908 IsSibCall = true;
7909
7910 if (IsTailCall)
7911 ++NumTailCalls;
7912 }
7913
7914 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
7915 report_fatal_error("failed to perform tail call elimination on a call "
7916 "site marked musttail");
7917
7918 // Get a count of how many bytes are to be pushed on the stack.
7919 unsigned NumBytes = CCInfo.getStackSize();
7920
7921 if (IsSibCall) {
7922 // Since we're not changing the ABI to make this a tail call, the memory
7923 // operands are already available in the caller's incoming argument space.
7924 NumBytes = 0;
7925 }
7926
7927 // FPDiff is the byte offset of the call's argument area from the callee's.
7928 // Stores to callee stack arguments will be placed in FixedStackSlots offset
7929 // by this amount for a tail call. In a sibling call it must be 0 because the
7930 // caller will deallocate the entire stack and the callee still expects its
7931 // arguments to begin at SP+0. Completely unused for non-tail calls.
7932 int FPDiff = 0;
7933
7934 if (IsTailCall && !IsSibCall) {
7935 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
7936
7937 // Since callee will pop argument stack as a tail call, we must keep the
7938 // popped size 16-byte aligned.
7939 NumBytes = alignTo(NumBytes, 16);
7940
7941 // FPDiff will be negative if this tail call requires more space than we
7942 // would automatically have in our incoming argument space. Positive if we
7943 // can actually shrink the stack.
7944 FPDiff = NumReusableBytes - NumBytes;
7945
7946 // Update the required reserved area if this is the tail call requiring the
7947 // most argument stack space.
7948 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
7949 FuncInfo->setTailCallReservedStack(-FPDiff);
7950
7951 // The stack pointer must be 16-byte aligned at all times it's used for a
7952 // memory operation, which in practice means at *all* times and in
7953 // particular across call boundaries. Therefore our own arguments started at
7954 // a 16-byte aligned SP and the delta applied for the tail call should
7955 // satisfy the same constraint.
7956 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
7957 }
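// Worked example (annotation, not part of the original source): if the caller
// reserved 16 bytes of incoming stack-argument space (NumReusableBytes == 16)
// and this tail call needs 32 bytes once rounded up to 16-byte alignment
// (NumBytes == 32), then FPDiff == -16 and TailCallReservedStack is raised to
// at least 16 so the prologue leaves room for the extra argument bytes.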
7958
7959 // Determine whether we need any streaming mode changes.
7960 SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction());
7961 if (CLI.CB)
7962 CalleeAttrs = SMEAttrs(*CLI.CB);
7963 else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
7964 CalleeAttrs = SMEAttrs(ES->getSymbol());
7965
7966 auto DescribeCallsite =
7967 [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
7968 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
7969 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
7970 R << ore::NV("Callee", ES->getSymbol());
7971 else if (CLI.CB && CLI.CB->getCalledFunction())
7972 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
7973 else
7974 R << "unknown callee";
7975 R << "'";
7976 return R;
7977 };
7978
7979 bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
7980 if (RequiresLazySave) {
7981 unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj();
7982 MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj);
7983 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj,
7984 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
7985 SDValue NumZaSaveSlicesAddr =
7986 DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
7987 DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));
7988 SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
7989 DAG.getConstant(1, DL, MVT::i32));
7990 Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr,
7991 MPI, MVT::i16);
7992 Chain = DAG.getNode(
7993 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
7994 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
7995 TPIDR2ObjAddr);
7996 OptimizationRemarkEmitter ORE(&MF.getFunction());
7997 ORE.emit([&]() {
7998 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
7999 CLI.CB)
8000 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
8001 &MF.getFunction());
8002 return DescribeCallsite(R) << " sets up a lazy save for ZA";
8003 });
8004 }
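// Annotation (not part of the original source): the block above fills in the
// two pieces of the TPIDR2 block that the SME lazy-save scheme relies on: the
// number of ZA save slices (RDSVL #1) is stored as a 16-bit value at byte
// offset 8 of the object, and llvm.aarch64.sme.set.tpidr2 then points
// TPIDR2_EL0 at the block so a callee that needs ZA can trigger the lazy
// save; the matching conditional restore via __arm_tpidr2_restore is emitted
// after the call further down.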
8005
8006 SDValue PStateSM;
8007 bool RequiresSMChange = CallerAttrs.requiresSMChange(CalleeAttrs);
8008 if (RequiresSMChange) {
8009 if (CallerAttrs.hasStreamingInterfaceOrBody())
8010 PStateSM = DAG.getConstant(1, DL, MVT::i64);
8011 else if (CallerAttrs.hasNonStreamingInterface())
8012 PStateSM = DAG.getConstant(0, DL, MVT::i64);
8013 else
8014 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
8015 OptimizationRemarkEmitter ORE(&MF.getFunction());
8016 ORE.emit([&]() {
8017 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
8018 CLI.CB)
8019 : OptimizationRemarkAnalysis("sme", "SMETransition",
8020 &MF.getFunction());
8021 DescribeCallsite(R) << " requires a streaming mode transition";
8022 return R;
8023 });
8024 }
8025
8026 SDValue ZTFrameIdx;
8027 MachineFrameInfo &MFI = MF.getFrameInfo();
8028 bool ShouldPreserveZT0 = CallerAttrs.requiresPreservingZT0(CalleeAttrs);
8029
8030 // If the caller has ZT0 state which will not be preserved by the callee,
8031 // spill ZT0 before the call.
8032 if (ShouldPreserveZT0) {
8033 unsigned ZTObj = MFI.CreateSpillStackObject(64, Align(16));
8034 ZTFrameIdx = DAG.getFrameIndex(
8035 ZTObj,
8036 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8037
8038 Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
8039 {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
8040 }
8041
8042 // If the caller shares ZT0 but the callee does not share ZA, we need to stop
8043 // PSTATE.ZA before the call if there is no lazy-save active.
8044 bool DisableZA = CallerAttrs.requiresDisablingZABeforeCall(CalleeAttrs);
8045 assert((!DisableZA || !RequiresLazySave) &&
8046 "Lazy-save should have PSTATE.SM=1 on entry to the function");
8047
8048 if (DisableZA)
8049 Chain = DAG.getNode(
8050 AArch64ISD::SMSTOP, DL, MVT::Other, Chain,
8051 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
8052 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
8053
8054 // Adjust the stack pointer for the new arguments...
8055 // These operations are automatically eliminated by the prolog/epilog pass
8056 if (!IsSibCall)
8057 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
8058
8059 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
8060 getPointerTy(DAG.getDataLayout()));
8061
8062 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
8063 SmallSet<unsigned, 8> RegsUsed;
8064 SmallVector<SDValue, 8> MemOpChains;
8065 auto PtrVT = getPointerTy(DAG.getDataLayout());
8066
8067 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
8068 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
8069 for (const auto &F : Forwards) {
8070 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
8071 RegsToPass.emplace_back(F.PReg, Val);
8072 }
8073 }
8074
8075 // Walk the register/memloc assignments, inserting copies/loads.
8076 unsigned ExtraArgLocs = 0;
8077 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
8078 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
8079 SDValue Arg = OutVals[i];
8080 ISD::ArgFlagsTy Flags = Outs[i].Flags;
8081
8082 // Promote the value if needed.
8083 switch (VA.getLocInfo()) {
8084 default:
8085 llvm_unreachable("Unknown loc info!");
8086 case CCValAssign::Full:
8087 break;
8088 case CCValAssign::SExt:
8089 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
8090 break;
8091 case CCValAssign::ZExt:
8092 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
8093 break;
8094 case CCValAssign::AExt:
8095 if (Outs[i].ArgVT == MVT::i1) {
8096 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
8097 //
8098 // Check if we actually have to do this, because the value may
8099 // already be zero-extended.
8100 //
8101 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
8102 // and rely on DAGCombiner to fold this, because the following
8103 // (anyext i32) is combined with (zext i8) in DAG.getNode:
8104 //
8105 // (ext (zext x)) -> (zext x)
8106 //
8107 // This will give us (zext i32), which we cannot remove, so
8108 // try to check this beforehand.
8109 if (!checkZExtBool(Arg, DAG)) {
8110 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
8111 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
8112 }
8113 }
8114 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
8115 break;
8116 case CCValAssign::AExtUpper:
8117 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
8118 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
8119 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
8120 DAG.getConstant(32, DL, VA.getLocVT()));
8121 break;
8122 case CCValAssign::BCvt:
8123 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
8124 break;
8125 case CCValAssign::Trunc:
8126 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8127 break;
8128 case CCValAssign::FPExt:
8129 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
8130 break;
8131 case CCValAssign::Indirect: {
8132 bool isScalable = VA.getValVT().isScalableVT();
8133 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
8134 "Indirect arguments should be scalable on most subtargets");
8135
8136 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
8137 uint64_t PartSize = StoreSize;
8138 unsigned NumParts = 1;
8139 if (Outs[i].Flags.isInConsecutiveRegs()) {
8140 assert(!Outs[i].Flags.isInConsecutiveRegsLast());
8141 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8142 ++NumParts;
8143 StoreSize *= NumParts;
8144 }
8145
8146 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
8147 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
8148 MachineFrameInfo &MFI = MF.getFrameInfo();
8149 int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
8150 if (isScalable)
8151 MFI.setStackID(FI, TargetStackID::ScalableVector);
8152
8153 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
8154 SDValue Ptr = DAG.getFrameIndex(
8155 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8156 SDValue SpillSlot = Ptr;
8157
8158 // Ensure we generate all stores for each tuple part, whilst updating the
8159 // pointer after each store correctly using vscale.
8160 while (NumParts) {
8161 SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
8162 MemOpChains.push_back(Store);
8163
8164 NumParts--;
8165 if (NumParts > 0) {
8166 SDValue BytesIncrement;
8167 if (isScalable) {
8168 BytesIncrement = DAG.getVScale(
8169 DL, Ptr.getValueType(),
8170 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
8171 } else {
8172 BytesIncrement = DAG.getConstant(
8173 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
8174 Ptr.getValueType());
8175 }
8176 SDNodeFlags Flags;
8177 Flags.setNoUnsignedWrap(true);
8178
8179 MPI = MachinePointerInfo(MPI.getAddrSpace());
8180 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8181 BytesIncrement, Flags);
8182 ExtraArgLocs++;
8183 i++;
8184 }
8185 }
8186
8187 Arg = SpillSlot;
8188 break;
8189 }
8190
8191 if (VA.isRegLoc()) {
8192 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
8193 Outs[0].VT == MVT::i64) {
8194 assert(VA.getLocVT() == MVT::i64 &&
8195 "unexpected calling convention register assignment");
8196 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
8197 "unexpected use of 'returned'");
8198 IsThisReturn = true;
8199 }
8200 if (RegsUsed.count(VA.getLocReg())) {
8201 // If this register has already been used then we're trying to pack
8202 // parts of an [N x i32] into an X-register. The extension type will
8203 // take care of putting the two halves in the right place but we have to
8204 // combine them.
8205 SDValue &Bits =
8206 llvm::find_if(RegsToPass,
8207 [=](const std::pair<unsigned, SDValue> &Elt) {
8208 return Elt.first == VA.getLocReg();
8209 })
8210 ->second;
8211 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
8212 // Call site info is used for function's parameter entry value
8213 // tracking. For now we track only simple cases when parameter
8214 // is transferred through whole register.
8215 llvm::erase_if(CSInfo, [&VA](MachineFunction::ArgRegPair ArgReg) {
8216 return ArgReg.Reg == VA.getLocReg();
8217 });
8218 } else {
8219 // Add an extra level of indirection for streaming mode changes by
8220 // using a pseudo copy node that the simple register coalescer cannot
8221 // rematerialise between an smstart/smstop and the call.
8222 if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
8223 Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8224 Arg.getValueType(), Arg);
8225 RegsToPass.emplace_back(VA.getLocReg(), Arg);
8226 RegsUsed.insert(VA.getLocReg());
8227 const TargetOptions &Options = DAG.getTarget().Options;
8228 if (Options.EmitCallSiteInfo)
8229 CSInfo.emplace_back(VA.getLocReg(), i);
8230 }
8231 } else {
8232 assert(VA.isMemLoc());
8233
8234 SDValue DstAddr;
8235 MachinePointerInfo DstInfo;
8236
8237 // FIXME: This works on big-endian for composite byvals, which are the
8238 // common case. It should also work for fundamental types too.
8239 uint32_t BEAlign = 0;
8240 unsigned OpSize;
8241 if (VA.getLocInfo() == CCValAssign::Indirect ||
8242 VA.getLocInfo() == CCValAssign::Trunc)
8243 OpSize = VA.getLocVT().getFixedSizeInBits();
8244 else
8245 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
8246 : VA.getValVT().getSizeInBits();
8247 OpSize = (OpSize + 7) / 8;
8248 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
8249 !Flags.isInConsecutiveRegs()) {
8250 if (OpSize < 8)
8251 BEAlign = 8 - OpSize;
8252 }
8253 unsigned LocMemOffset = VA.getLocMemOffset();
8254 int32_t Offset = LocMemOffset + BEAlign;
8255 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
8256 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
8257
8258 if (IsTailCall) {
8259 Offset = Offset + FPDiff;
8260 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
8261
8262 DstAddr = DAG.getFrameIndex(FI, PtrVT);
8263 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
8264
8265 // Make sure any stack arguments overlapping with where we're storing
8266 // are loaded before this eventual operation. Otherwise they'll be
8267 // clobbered.
8268 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
8269 } else {
8270 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
8271
8272 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
8273 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
8274 }
8275
8276 if (Outs[i].Flags.isByVal()) {
8277 SDValue SizeNode =
8278 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
8279 SDValue Cpy = DAG.getMemcpy(
8280 Chain, DL, DstAddr, Arg, SizeNode,
8281 Outs[i].Flags.getNonZeroByValAlign(),
8282 /*isVol = */ false, /*AlwaysInline = */ false,
8283 /*isTailCall = */ false, DstInfo, MachinePointerInfo());
8284
8285 MemOpChains.push_back(Cpy);
8286 } else {
8287 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
8288 // promoted to a legal register type i32, we should truncate Arg back to
8289 // i1/i8/i16.
8290 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
8291 VA.getValVT() == MVT::i16)
8292 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
8293
8294 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
8295 MemOpChains.push_back(Store);
8296 }
8297 }
8298 }
8299
8300 if (IsVarArg && Subtarget->isWindowsArm64EC()) {
8301 SDValue ParamPtr = StackPtr;
8302 if (IsTailCall) {
8303 // Create a dummy object at the top of the stack that can be used to get
8304 // the SP after the epilogue
8305 int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
8306 ParamPtr = DAG.getFrameIndex(FI, PtrVT);
8307 }
8308
8309 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
8310 // describing the argument list. x4 contains the address of the
8311 // first stack parameter. x5 contains the size in bytes of all parameters
8312 // passed on the stack.
8313 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
8314 RegsToPass.emplace_back(AArch64::X5,
8315 DAG.getConstant(NumBytes, DL, MVT::i64));
8316 }
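// Example (annotation, not part of the original source): for a variadic
// Arm64EC call that pushes 24 bytes of stack arguments, x4 is loaded with the
// address of the first stack parameter (SP-relative here, or a fixed object
// at FPDiff for tail calls) and x5 with the constant 24, matching the x4/x5
// argument-list description required by the Arm64EC variadic convention.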
8317
8318 if (!MemOpChains.empty())
8319 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
8320
8321 SDValue InGlue;
8322 if (RequiresSMChange) {
8323 SDValue NewChain = changeStreamingMode(
8324 DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain, InGlue,
8325 getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
8326 Chain = NewChain.getValue(0);
8327 InGlue = NewChain.getValue(1);
8328 }
8329
8330 // Build a sequence of copy-to-reg nodes chained together with token chain
8331 // and flag operands which copy the outgoing args into the appropriate regs.
8332 for (auto &RegToPass : RegsToPass) {
8333 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
8334 RegToPass.second, InGlue);
8335 InGlue = Chain.getValue(1);
8336 }
8337
8338 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
8339 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
8340 // node so that legalize doesn't hack it.
8341 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
8342 auto GV = G->getGlobal();
8343 unsigned OpFlags =
8344 Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine());
8345 if (OpFlags & AArch64II::MO_GOT) {
8346 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
8347 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
8348 } else {
8349 const GlobalValue *GV = G->getGlobal();
8350 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
8351 }
8352 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
8353 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
8354 Subtarget->isTargetMachO()) ||
8355 MF.getFunction().getParent()->getRtLibUseGOT();
8356 const char *Sym = S->getSymbol();
8357 if (UseGot) {
8358 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
8359 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
8360 } else {
8361 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
8362 }
8363 }
8364
8365 // We don't usually want to end the call-sequence here because we would tidy
8366 // the frame up *after* the call, however in the ABI-changing tail-call case
8367 // we've carefully laid out the parameters so that when sp is reset they'll be
8368 // in the correct location.
8369 if (IsTailCall && !IsSibCall) {
8370 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
8371 InGlue = Chain.getValue(1);
8372 }
8373
8374 std::vector<SDValue> Ops;
8375 Ops.push_back(Chain);
8376 Ops.push_back(Callee);
8377
8378 if (IsTailCall) {
8379 // Each tail call may have to adjust the stack by a different amount, so
8380 // this information must travel along with the operation for eventual
8381 // consumption by emitEpilogue.
8382 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
8383 }
8384
8385 // Add argument registers to the end of the list so that they are known live
8386 // into the call.
8387 for (auto &RegToPass : RegsToPass)
8388 Ops.push_back(DAG.getRegister(RegToPass.first,
8389 RegToPass.second.getValueType()));
8390
8391 // Add a register mask operand representing the call-preserved registers.
8392 const uint32_t *Mask;
8393 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8394 if (IsThisReturn) {
8395 // For 'this' returns, use the X0-preserving mask if applicable
8396 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
8397 if (!Mask) {
8398 IsThisReturn = false;
8399 Mask = TRI->getCallPreservedMask(MF, CallConv);
8400 }
8401 } else
8402 Mask = TRI->getCallPreservedMask(MF, CallConv);
8403
8404 if (Subtarget->hasCustomCallingConv())
8405 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
8406
8407 if (TRI->isAnyArgRegReserved(MF))
8408 TRI->emitReservedArgRegCallError(MF);
8409
8410 assert(Mask && "Missing call preserved mask for calling convention");
8411 Ops.push_back(DAG.getRegisterMask(Mask));
8412
8413 if (InGlue.getNode())
8414 Ops.push_back(InGlue);
8415
8416 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
8417
8418 // If we're doing a tail call, use a TC_RETURN here rather than an
8419 // actual call instruction.
8420 if (IsTailCall) {
8422 SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
8423
8424 if (IsCFICall)
8425 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
8426
8427 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
8428 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
8429 return Ret;
8430 }
8431
8432 unsigned CallOpc = AArch64ISD::CALL;
8433 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
8434 // be expanded to the call, directly followed by a special marker sequence and
8435 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
8436 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
8437 assert(!IsTailCall &&
8438 "tail calls cannot be marked with clang.arc.attachedcall");
8439 CallOpc = AArch64ISD::CALL_RVMARKER;
8440
8441 // Add a target global address for the retainRV/claimRV runtime function
8442 // just before the call target.
8443 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
8444 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
8445 Ops.insert(Ops.begin() + 1, GA);
8446 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
8447 CallOpc = AArch64ISD::CALL_ARM64EC_TO_X64;
8448 } else if (GuardWithBTI) {
8449 CallOpc = AArch64ISD::CALL_BTI;
8450 }
8451
8452 // Returns a chain and a flag for retval copy to use.
8453 Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops);
8454
8455 if (IsCFICall)
8456 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
8457
8458 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
8459 InGlue = Chain.getValue(1);
8460 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
8461
8462 uint64_t CalleePopBytes =
8463 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
8464
8465 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
8466 InGlue = Chain.getValue(1);
8467
8468 // Handle result values, copying them out of physregs into vregs that we
8469 // return.
8470 SDValue Result = LowerCallResult(
8471 Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
8472 IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
8473
8474 if (!Ins.empty())
8475 InGlue = Result.getValue(Result->getNumValues() - 1);
8476
8477 if (RequiresSMChange) {
8478 assert(PStateSM && "Expected a PStateSM to be set");
8479 Result = changeStreamingMode(
8480 DAG, DL, !CalleeAttrs.hasStreamingInterface(), Result, InGlue,
8481 getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
8482 }
8483
8484 if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs))
8485 // Unconditionally resume ZA.
8486 Result = DAG.getNode(
8487 AArch64ISD::SMSTART, DL, MVT::Other, Result,
8488 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
8489 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
8490
8491 if (ShouldPreserveZT0)
8492 Result =
8493 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
8494 {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
8495
8496 if (RequiresLazySave) {
8497 // Conditionally restore the lazy save using a pseudo node.
8498 unsigned FI = FuncInfo->getLazySaveTPIDR2Obj();
8499 SDValue RegMask = DAG.getRegisterMask(
8500 TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
8501 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
8502 "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout()));
8503 SDValue TPIDR2_EL0 = DAG.getNode(
8504 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
8505 DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
8506
8507 // Copy the address of the TPIDR2 block into X0 before 'calling' the
8508 // RESTORE_ZA pseudo.
8509 SDValue Glue;
8510 SDValue TPIDR2Block = DAG.getFrameIndex(
8511 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8512 Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
8513 Result =
8514 DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
8515 {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
8516 RestoreRoutine, RegMask, Result.getValue(1)});
8517
8518 // Finally reset the TPIDR2_EL0 register to 0.
8519 Result = DAG.getNode(
8520 ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
8521 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8522 DAG.getConstant(0, DL, MVT::i64));
8523 }
8524
8525 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) {
8526 for (unsigned I = 0; I < InVals.size(); ++I) {
8527 // The smstart/smstop is chained as part of the call, but when the
8528 // resulting chain is discarded (which happens when the call is not part
8529 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
8530 // smstart/smstop is chained to the result value. We can do that by doing
8531 // a vreg -> vreg copy.
8532 Register Reg = MF.getRegInfo().createVirtualRegister(
8533 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
8534 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
8535 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
8536 InVals[I].getValueType());
8537 }
8538 }
8539
8540 return Result;
8541}
8542
8543bool AArch64TargetLowering::CanLowerReturn(
8544 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
8545 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
8546 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
8547 SmallVector<CCValAssign, 16> RVLocs;
8548 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
8549 return CCInfo.CheckReturn(Outs, RetCC);
8550}
8551
8552SDValue
8553AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
8554 bool isVarArg,
8555 const SmallVectorImpl<ISD::OutputArg> &Outs,
8556 const SmallVectorImpl<SDValue> &OutVals,
8557 const SDLoc &DL, SelectionDAG &DAG) const {
8558 auto &MF = DAG.getMachineFunction();
8559 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8560
8561 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
8562 SmallVector<CCValAssign, 16> RVLocs;
8563 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
8564 CCInfo.AnalyzeReturn(Outs, RetCC);
8565
8566 // Copy the result values into the output registers.
8567 SDValue Glue;
8568 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
8569 SmallSet<unsigned, 4> RegsUsed;
8570 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
8571 ++i, ++realRVLocIdx) {
8572 CCValAssign &VA = RVLocs[i];
8573 assert(VA.isRegLoc() && "Can only return in registers!");
8574 SDValue Arg = OutVals[realRVLocIdx];
8575
8576 switch (VA.getLocInfo()) {
8577 default:
8578 llvm_unreachable("Unknown loc info!");
8579 case CCValAssign::Full:
8580 if (Outs[i].ArgVT == MVT::i1) {
8581 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
8582 // value. This is strictly redundant on Darwin (which uses "zeroext
8583 // i1"), but will be optimised out before ISel.
8584 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
8585 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
8586 }
8587 break;
8588 case CCValAssign::BCvt:
8589 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
8590 break;
8591 case CCValAssign::AExt:
8592 case CCValAssign::ZExt:
8593 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8594 break;
8595 case CCValAssign::AExtUpper:
8596 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
8597 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8598 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
8599 DAG.getConstant(32, DL, VA.getLocVT()));
8600 break;
8601 }
8602
8603 if (RegsUsed.count(VA.getLocReg())) {
8604 SDValue &Bits =
8605 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
8606 return Elt.first == VA.getLocReg();
8607 })->second;
8608 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
8609 } else {
8610 RetVals.emplace_back(VA.getLocReg(), Arg);
8611 RegsUsed.insert(VA.getLocReg());
8612 }
8613 }
8614
8615 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8616
8617 // Emit SMSTOP before returning from a locally streaming function
8618 SMEAttrs FuncAttrs(MF.getFunction());
8619 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
8620 if (FuncAttrs.hasStreamingCompatibleInterface()) {
8621 Register Reg = FuncInfo->getPStateSMReg();
8622 assert(Reg.isValid() && "PStateSM Register is invalid");
8623 SDValue PStateSM = DAG.getCopyFromReg(Chain, DL, Reg, MVT::i64);
8624 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
8625 /*Glue*/ SDValue(),
8626 AArch64SME::IfCallerIsStreaming, PStateSM);
8627 } else
8628 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
8629 /*Glue*/ SDValue(), AArch64SME::Always);
8630 Glue = Chain.getValue(1);
8631 }
8632
8633 SmallVector<SDValue, 4> RetOps(1, Chain);
8634 for (auto &RetVal : RetVals) {
8635 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
8636 Glue = Chain.getValue(1);
8637 RetOps.push_back(
8638 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
8639 }
8640
8641 // Windows AArch64 ABIs require that for returning structs by value we copy
8642 // the sret argument into X0 for the return.
8643 // We saved the argument into a virtual register in the entry block,
8644 // so now we copy the value out and into X0.
8645 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
8646 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
8647 getPointerTy(MF.getDataLayout()));
8648
8649 unsigned RetValReg = AArch64::X0;
8650 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8651 RetValReg = AArch64::X8;
8652 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
8653 Glue = Chain.getValue(1);
8654
8655 RetOps.push_back(
8656 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
8657 }
8658
8659 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
8660 if (I) {
8661 for (; *I; ++I) {
8662 if (AArch64::GPR64RegClass.contains(*I))
8663 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
8664 else if (AArch64::FPR64RegClass.contains(*I))
8665 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
8666 else
8667 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
8668 }
8669 }
8670
8671 RetOps[0] = Chain; // Update chain.
8672
8673 // Add the glue if we have it.
8674 if (Glue.getNode())
8675 RetOps.push_back(Glue);
8676
8677 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
8678 // ARM64EC entry thunks use a special return sequence: instead of a regular
8679 // "ret" instruction, they need to explicitly call the emulator.
8680 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8681 SDValue Arm64ECRetDest =
8682 DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
8683 Arm64ECRetDest =
8684 getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
8685 Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
8686 MachinePointerInfo());
8687 RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
8688 RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
8689 return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
8690 }
8691
8692 return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
8693}
8694
8695//===----------------------------------------------------------------------===//
8696// Other Lowering Code
8697//===----------------------------------------------------------------------===//
8698
8699SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
8700 SelectionDAG &DAG,
8701 unsigned Flag) const {
8702 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
8703 N->getOffset(), Flag);
8704}
8705
8706SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
8707 SelectionDAG &DAG,
8708 unsigned Flag) const {
8709 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
8710}
8711
8712SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
8713 SelectionDAG &DAG,
8714 unsigned Flag) const {
8715 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
8716 N->getOffset(), Flag);
8717}
8718
8719SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
8720 SelectionDAG &DAG,
8721 unsigned Flag) const {
8722 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
8723}
8724
8725SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
8726 SelectionDAG &DAG,
8727 unsigned Flag) const {
8728 return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
8729}
8730
8731// (loadGOT sym)
8732template <class NodeTy>
8733SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
8734 unsigned Flags) const {
8735 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
8736 SDLoc DL(N);
8737 EVT Ty = getPointerTy(DAG.getDataLayout());
8738 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
8739 // FIXME: Once remat is capable of dealing with instructions with register
8740 // operands, expand this into two nodes instead of using a wrapper node.
8741 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
8742}
8743
8744// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
8745template <class NodeTy>
8746SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
8747 unsigned Flags) const {
8748 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
8749 SDLoc DL(N);
8750 EVT Ty = getPointerTy(DAG.getDataLayout());
8751 const unsigned char MO_NC = AArch64II::MO_NC;
8752 return DAG.getNode(
8753 AArch64ISD::WrapperLarge, DL, Ty,
8754 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
8755 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
8756 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
8757 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
8758}
8759
8760// (addlow (adrp %hi(sym)) %lo(sym))
8761template <class NodeTy>
8762SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
8763 unsigned Flags) const {
8764 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
8765 SDLoc DL(N);
8766 EVT Ty = getPointerTy(DAG.getDataLayout());
8767 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
8768 SDValue Lo = getTargetNode(N, Ty, DAG,
8769 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
8770 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
8771 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
8772}
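// Example (annotation, not part of the original source): for the small code
// model the ADRP/ADDlow pair produced by getAddr() typically materialises as
//   adrp x0, sym            ; page address of sym
//   add  x0, x0, :lo12:sym  ; add the low 12 bits
// which is the (addlow (adrp %hi(sym)) %lo(sym)) pattern named in the comment
// above.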
8773
8774// (adr sym)
8775template <class NodeTy>
8776SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
8777 unsigned Flags) const {
8778 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
8779 SDLoc DL(N);
8780 EVT Ty = getPointerTy(DAG.getDataLayout());
8781 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
8782 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
8783}
8784
8785SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
8786 SelectionDAG &DAG) const {
8787 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
8788 const GlobalValue *GV = GN->getGlobal();
8789 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
8790
8791 if (OpFlags != AArch64II::MO_NO_FLAG)
8792 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
8793 "unexpected offset in global node");
8794
8795 // This also catches the large code model case for Darwin, and tiny code
8796 // model with got relocations.
8797 if ((OpFlags & AArch64II::MO_GOT) != 0) {
8798 return getGOT(GN, DAG, OpFlags);
8799 }
8800
8801 SDValue Result;
8802 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
8803 !getTargetMachine().isPositionIndependent()) {
8804 Result = getAddrLarge(GN, DAG, OpFlags);
8805 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
8806 Result = getAddrTiny(GN, DAG, OpFlags);
8807 } else {
8808 Result = getAddr(GN, DAG, OpFlags);
8809 }
8810 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8811 SDLoc DL(GN);
8812 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
8813 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
8814 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
8815 return Result;
8816}
8817
8818/// Convert a TLS address reference into the correct sequence of loads
8819/// and calls to compute the variable's address (for Darwin, currently) and
8820/// return an SDValue containing the final node.
8821
8822/// Darwin only has one TLS scheme which must be capable of dealing with the
8823/// fully general situation, in the worst case. This means:
8824/// + "extern __thread" declaration.
8825/// + Defined in a possibly unknown dynamic library.
8826///
8827/// The general system is that each __thread variable has a [3 x i64] descriptor
8828/// which contains information used by the runtime to calculate the address. The
8829/// only part of this the compiler needs to know about is the first xword, which
8830/// contains a function pointer that must be called with the address of the
8831/// entire descriptor in "x0".
8832///
8833/// Since this descriptor may be in a different unit, in general even the
8834/// descriptor must be accessed via an indirect load. The "ideal" code sequence
8835/// is:
8836/// adrp x0, _var@TLVPPAGE
8837/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
8838/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
8839/// ; the function pointer
8840/// blr x1 ; Uses descriptor address in x0
8841/// ; Address of _var is now in x0.
8842///
8843/// If the address of _var's descriptor *is* known to the linker, then it can
8844/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
8845/// a slight efficiency gain.
8846SDValue
8847AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
8848 SelectionDAG &DAG) const {
8849 assert(Subtarget->isTargetDarwin() &&
8850 "This function expects a Darwin target");
8851
8852 SDLoc DL(Op);
8853 MVT PtrVT = getPointerTy(DAG.getDataLayout());
8854 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
8855 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
8856
8857 SDValue TLVPAddr =
8858 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
8859 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
8860
8861 // The first entry in the descriptor is a function pointer that we must call
8862 // to obtain the address of the variable.
8863 SDValue Chain = DAG.getEntryNode();
8864 SDValue FuncTLVGet = DAG.getLoad(
8865 PtrMemVT, DL, Chain, DescAddr,
8866 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
8867 Align(PtrMemVT.getSizeInBits() / 8),
8868 MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
8869 Chain = FuncTLVGet.getValue(1);
8870
8871 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
8872 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
8873
8874 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
8875 MFI.setAdjustsStack(true);
8876
8877 // TLS calls preserve all registers except those that absolutely must be
8878 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
8879 // silly).
8880 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8881 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
8882 if (Subtarget->hasCustomCallingConv())
8883 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
8884
8885 // Finally, we can make the call. This is just a degenerate version of a
8886 // normal AArch64 call node: x0 takes the address of the descriptor, and
8887 // returns the address of the variable in this thread.
8888 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
8889 Chain =
8890 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
8891 Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
8892 DAG.getRegisterMask(Mask), Chain.getValue(1));
8893 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
8894}
8895
8896/// Convert a thread-local variable reference into a sequence of instructions to
8897/// compute the variable's address for the local exec TLS model of ELF targets.
8898/// The sequence depends on the maximum TLS area size.
8899SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
8900 SDValue ThreadBase,
8901 const SDLoc &DL,
8902 SelectionDAG &DAG) const {
8903 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8904 SDValue TPOff, Addr;
8905
8906 switch (DAG.getTarget().Options.TLSSize) {
8907 default:
8908 llvm_unreachable("Unexpected TLS size");
8909
8910 case 12: {
8911 // mrs x0, TPIDR_EL0
8912 // add x0, x0, :tprel_lo12:a
8913 SDValue Var = DAG.getTargetGlobalAddress(
8914 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
8915 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
8916 Var,
8917 DAG.getTargetConstant(0, DL, MVT::i32)),
8918 0);
8919 }
8920
8921 case 24: {
8922 // mrs x0, TPIDR_EL0
8923 // add x0, x0, :tprel_hi12:a
8924 // add x0, x0, :tprel_lo12_nc:a
8925 SDValue HiVar = DAG.getTargetGlobalAddress(
8926 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
8927 SDValue LoVar = DAG.getTargetGlobalAddress(
8928 GV, DL, PtrVT, 0,
8929 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
8930 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
8931 HiVar,
8932 DAG.getTargetConstant(0, DL, MVT::i32)),
8933 0);
8934 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
8935 LoVar,
8936 DAG.getTargetConstant(0, DL, MVT::i32)),
8937 0);
8938 }
8939
8940 case 32: {
8941 // mrs x1, TPIDR_EL0
8942 // movz x0, #:tprel_g1:a
8943 // movk x0, #:tprel_g0_nc:a
8944 // add x0, x1, x0
8945 SDValue HiVar = DAG.getTargetGlobalAddress(
8946 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
8947 SDValue LoVar = DAG.getTargetGlobalAddress(
8948 GV, DL, PtrVT, 0,
8949 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
8950 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
8951 DAG.getTargetConstant(16, DL, MVT::i32)),
8952 0);
8953 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
8954 DAG.getTargetConstant(0, DL, MVT::i32)),
8955 0);
8956 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
8957 }
8958
8959 case 48: {
8960 // mrs x1, TPIDR_EL0
8961 // movz x0, #:tprel_g2:a
8962 // movk x0, #:tprel_g1_nc:a
8963 // movk x0, #:tprel_g0_nc:a
8964 // add x0, x1, x0
8965 SDValue HiVar = DAG.getTargetGlobalAddress(
8966 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
8967 SDValue MiVar = DAG.getTargetGlobalAddress(
8968 GV, DL, PtrVT, 0,
8969 AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
8970 SDValue LoVar = DAG.getTargetGlobalAddress(
8971 GV, DL, PtrVT, 0,
8972 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
8973 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
8974 DAG.getTargetConstant(32, DL, MVT::i32)),
8975 0);
8976 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
8977 DAG.getTargetConstant(16, DL, MVT::i32)),
8978 0);
8979 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
8980 DAG.getTargetConstant(0, DL, MVT::i32)),
8981 0);
8982 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
8983 }
8984 }
8985}
8986
8987/// When accessing thread-local variables under either the general-dynamic or
8988/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
8989/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
8990/// is a function pointer to carry out the resolution.
8991///
8992/// The sequence is:
8993/// adrp x0, :tlsdesc:var
8994/// ldr x1, [x0, #:tlsdesc_lo12:var]
8995/// add x0, x0, #:tlsdesc_lo12:var
8996/// .tlsdesccall var
8997/// blr x1
8998/// (TPIDR_EL0 offset now in x0)
8999///
9000/// The above sequence must be produced unscheduled, to enable the linker to
9001/// optimize/relax this sequence.
9002/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
9003/// above sequence, and expanded really late in the compilation flow, to ensure
9004/// the sequence is produced as per above.
9005SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
9006 const SDLoc &DL,
9007 SelectionDAG &DAG) const {
9008 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9009
9010 SDValue Chain = DAG.getEntryNode();
9011 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
9012
9013 Chain =
9014 DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
9015 SDValue Glue = Chain.getValue(1);
9016
9017 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
9018}
9019
9020SDValue
9021AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
9022 SelectionDAG &DAG) const {
9023 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
9024
9025 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9026
9027 TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
9028
9029 if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
9030 if (Model == TLSModel::LocalDynamic)
9031 Model = TLSModel::GeneralDynamic;
9032 }
9033
9034 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
9035 Model != TLSModel::LocalExec)
9036 report_fatal_error("ELF TLS only supported in small memory model or "
9037 "in local exec TLS model");
9038 // Different choices can be made for the maximum size of the TLS area for a
9039 // module. For the small address model, the default TLS size is 16MiB and the
9040 // maximum TLS size is 4GiB.
9041 // FIXME: add tiny and large code model support for TLS access models other
9042 // than local exec. We currently generate the same code as small for tiny,
9043 // which may be larger than needed.
9044
9045 SDValue TPOff;
9046 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9047 SDLoc DL(Op);
9048 const GlobalValue *GV = GA->getGlobal();
9049
9050 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
9051
9052 if (Model == TLSModel::LocalExec) {
9053 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
9054 } else if (Model == TLSModel::InitialExec) {
9055 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
9056 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
9057 } else if (Model == TLSModel::LocalDynamic) {
9058 // Local-dynamic accesses proceed in two phases. First, a general-dynamic
9059 // TLS descriptor call against the special symbol _TLS_MODULE_BASE_
9060 // calculates the beginning of the module's TLS region; then a DTPREL
9061 // offset within that region is added.
9062
9063 // These accesses will need deduplicating if there's more than one.
9064 AArch64FunctionInfo *MFI =
9065 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
9066 MFI->incNumLocalDynamicTLSAccesses();
9067
9068 // The call needs a relocation too for linker relaxation. It doesn't make
9069 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
9070 // the address.
9071 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
9072 AArch64II::MO_TLS);
9073
9074 // Now we can calculate the offset from TPIDR_EL0 to this module's
9075 // thread-local area.
9076 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
9077
9078 // Now use :dtprel_whatever: operations to calculate this variable's offset
9079 // in its thread-storage area.
9080 SDValue HiVar = DAG.getTargetGlobalAddress(
9081 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
9082 SDValue LoVar = DAG.getTargetGlobalAddress(
9083 GV, DL, MVT::i64, 0,
9084 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9085
9086 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
9087 DAG.getTargetConstant(0, DL, MVT::i32)),
9088 0);
9089 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
9090 DAG.getTargetConstant(0, DL, MVT::i32)),
9091 0);
9092 } else if (Model == TLSModel::GeneralDynamic) {
9093 // The call needs a relocation too for linker relaxation. It doesn't make
9094 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
9095 // the address.
9096 SDValue SymAddr =
9097 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
9098
9099 // Finally we can make a call to calculate the offset from tpidr_el0.
9100 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
9101 } else
9102 llvm_unreachable("Unsupported ELF TLS access model");
9103
9104 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
9105}
9106
9107SDValue
9108AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
9109 SelectionDAG &DAG) const {
9110 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
9111
9112 SDValue Chain = DAG.getEntryNode();
9113 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9114 SDLoc DL(Op);
9115
9116 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
9117
9118 // Load the ThreadLocalStoragePointer from the TEB
9119 // A pointer to the TLS array is located at offset 0x58 from the TEB.
9120 SDValue TLSArray =
9121 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
9122 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
9123 Chain = TLSArray.getValue(1);
9124
9125 // Load the TLS index from the C runtime;
9126 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
9127 // This also does the same as LOADgot, but using a generic i32 load,
9128 // while LOADgot only loads i64.
9129 SDValue TLSIndexHi =
9130 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
9131 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
9132 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9133 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
9134 SDValue TLSIndex =
9135 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
9136 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
9137 Chain = TLSIndex.getValue(1);
9138
9139 // The pointer to this module's TLS data area for the current thread lives in
9140 // the TLS array at the slot given by the TLS index scaled by 8.
9141 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
9142 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
9143 DAG.getConstant(3, DL, PtrVT));
9144 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
9145 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
9146 MachinePointerInfo());
9147 Chain = TLS.getValue(1);
9148
9149 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9150 const GlobalValue *GV = GA->getGlobal();
9151 SDValue TGAHi = DAG.getTargetGlobalAddress(
9152 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
9153 SDValue TGALo = DAG.getTargetGlobalAddress(
9154 GV, DL, PtrVT, 0,
9155 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9156
9157 // Add the offset from the start of the .tls section (section base).
9158 SDValue Addr =
9159 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
9160 DAG.getTargetConstant(0, DL, MVT::i32)),
9161 0);
9162 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
9163 return Addr;
9164}
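// Roughly, the DAG built above corresponds to the following sequence
// (illustrative only; registers and addressing folds are chosen later):
//   ldr  x8, [x18, #0x58]        // ThreadLocalStoragePointer from the TEB
//   adrp/ldr _tls_index          // this module's 32-bit TLS index
//   ldr  x8, [x8, x9, lsl #3]    // this module's block in the TLS array
// followed by an ADDXri/ADDlow pair applying the HI12/LO12 section-relative
// offset of the variable within the .tls section.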
9165
9166SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
9167 SelectionDAG &DAG) const {
9168 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9169 if (DAG.getTarget().useEmulatedTLS())
9170 return LowerToTLSEmulatedModel(GA, DAG);
9171
9172 if (Subtarget->isTargetDarwin())
9173 return LowerDarwinGlobalTLSAddress(Op, DAG);
9174 if (Subtarget->isTargetELF())
9175 return LowerELFGlobalTLSAddress(Op, DAG);
9176 if (Subtarget->isTargetWindows())
9177 return LowerWindowsGlobalTLSAddress(Op, DAG);
9178
9179 llvm_unreachable("Unexpected platform trying to use TLS");
9180}
9181
9182// Looks through \param Val to determine the bit that can be used to
9183// check the sign of the value. It returns the unextended value and
9184// the sign bit position.
9185std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
9186 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
9187 return {Val.getOperand(0),
9188 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
9189 1};
9190
9191 if (Val.getOpcode() == ISD::SIGN_EXTEND)
9192 return {Val.getOperand(0),
9193 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
9194
9195 return {Val, Val.getValueSizeInBits() - 1};
9196}
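// For example, (sign_extend_inreg i32 %x, i8) yields {%x, 7}, i.e. bit 7 of
// the unextended value carries the sign, while a plain i64 value yields
// {value, 63}.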
9197
9198SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
9199 SDValue Chain = Op.getOperand(0);
9200 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
9201 SDValue LHS = Op.getOperand(2);
9202 SDValue RHS = Op.getOperand(3);
9203 SDValue Dest = Op.getOperand(4);
9204 SDLoc dl(Op);
9205
9206 MachineFunction &MF = DAG.getMachineFunction();
9207 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
9208 // will not be produced, as they are conditional branch instructions that do
9209 // not set flags.
9210 bool ProduceNonFlagSettingCondBr =
9211 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
9212
9213 // Handle f128 first, since lowering it will result in comparing the return
9214 // value of a libcall against zero, which is just what the rest of LowerBR_CC
9215 // is expecting to deal with.
9216 if (LHS.getValueType() == MVT::f128) {
9217 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
9218
9219 // If softenSetCCOperands returned a scalar, we need to compare the result
9220 // against zero to select between true and false values.
9221 if (!RHS.getNode()) {
9222 RHS = DAG.getConstant(0, dl, LHS.getValueType());
9223 CC = ISD::SETNE;
9224 }
9225 }
9226
9227 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
9228 // instruction.
9229 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
9230 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
9231 // Only lower legal XALUO ops.
9232 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
9233 return SDValue();
9234
9235 // The actual operation with overflow check.
9236 AArch64CC::CondCode OFCC;
9237 SDValue Value, Overflow;
9238 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
9239
9240 if (CC == ISD::SETNE)
9241 OFCC = getInvertedCondCode(OFCC);
9242 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
9243
9244 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
9245 Overflow);
9246 }
9247
9248 if (LHS.getValueType().isInteger()) {
9249 assert((LHS.getValueType() == RHS.getValueType()) &&
9250 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
9251
9252 // If the RHS of the comparison is zero, we can potentially fold this
9253 // to a specialized branch.
9254 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
9255 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
9256 if (CC == ISD::SETEQ) {
9257 // See if we can use a TBZ to fold in an AND as well.
9258 // TBZ has a smaller branch displacement than CBZ. If the offset is
9259 // out of bounds, a late MI-layer pass rewrites branches.
9260 // 403.gcc is an example that hits this case.
9261 if (LHS.getOpcode() == ISD::AND &&
9262 isa<ConstantSDNode>(LHS.getOperand(1)) &&
9263 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
9264 SDValue Test = LHS.getOperand(0);
9265 uint64_t Mask = LHS.getConstantOperandVal(1);
9266 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
9267 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
9268 Dest);
9269 }
9270
9271 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
9272 } else if (CC == ISD::SETNE) {
9273 // See if we can use a TBZ to fold in an AND as well.
9274 // TBZ has a smaller branch displacement than CBZ. If the offset is
9275 // out of bounds, a late MI-layer pass rewrites branches.
9276 // 403.gcc is an example that hits this case.
9277 if (LHS.getOpcode() == ISD::AND &&
9278 isa<ConstantSDNode>(LHS.getOperand(1)) &&
9279 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
9280 SDValue Test = LHS.getOperand(0);
9281 uint64_t Mask = LHS.getConstantOperandVal(1);
9282 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
9283 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
9284 Dest);
9285 }
9286
9287 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
9288 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
9289 // Don't combine AND since emitComparison converts the AND to an ANDS
9290 // (a.k.a. TST) and the test in the test bit and branch instruction
9291 // becomes redundant. This would also increase register pressure.
9292 uint64_t SignBitPos;
9293 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
9294 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
9295 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
9296 }
9297 }
9298 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
9299 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
9300 // Don't combine AND since emitComparison converts the AND to an ANDS
9301 // (a.k.a. TST) and the test in the test bit and branch instruction
9302 // becomes redundant. This would also increase register pressure.
9303 uint64_t SignBitPos;
9304 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
9305 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
9306 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
9307 }
9308
9309 SDValue CCVal;
9310 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
9311 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
9312 Cmp);
9313 }
9314
9315 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
9316 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
9317
9318 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
9319 // clean. Some of them require two branches to implement.
9320 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9321 AArch64CC::CondCode CC1, CC2;
9322 changeFPCCToAArch64CC(CC, CC1, CC2);
9323 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9324 SDValue BR1 =
9325 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
9326 if (CC2 != AArch64CC::AL) {
9327 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9328 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
9329 Cmp);
9330 }
9331
9332 return BR1;
9333}
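// A few illustrative cases of the folds above (assuming speculative load
// hardening is disabled, so TB(N)Z/CB(N)Z may be produced):
//   brcond (seteq (and x, 4), 0), dest  ->  tbz  x, #2, dest
//   brcond (setlt x, 0), dest           ->  tbnz x, #63, dest   (i64 x)
//   brcond (setgt x, -1), dest          ->  tbz  x, #63, dest   (i64 x)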
9334
9335SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
9336 SelectionDAG &DAG) const {
9337 if (!Subtarget->hasNEON())
9338 return SDValue();
9339
9340 EVT VT = Op.getValueType();
9341 EVT IntVT = VT.changeTypeToInteger();
9342 SDLoc DL(Op);
9343
9344 SDValue In1 = Op.getOperand(0);
9345 SDValue In2 = Op.getOperand(1);
9346 EVT SrcVT = In2.getValueType();
9347
9348 if (!SrcVT.bitsEq(VT))
9349 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
9350
9351 if (VT.isScalableVector())
9352 IntVT =
9353 getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
9354
9355 if (VT.isFixedLengthVector() &&
9356 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
9357 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
9358
9359 In1 = convertToScalableVector(DAG, ContainerVT, In1);
9360 In2 = convertToScalableVector(DAG, ContainerVT, In2);
9361
9362 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
9363 return convertFromScalableVector(DAG, VT, Res);
9364 }
9365
9366 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
9367 if (VT.isScalableVector())
9368 return getSVESafeBitCast(VT, Op, DAG);
9369
9370 return DAG.getBitcast(VT, Op);
9371 };
9372
9373 SDValue VecVal1, VecVal2;
9374 EVT VecVT;
9375 auto SetVecVal = [&](int Idx = -1) {
9376 if (!VT.isVector()) {
9377 VecVal1 =
9378 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
9379 VecVal2 =
9380 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
9381 } else {
9382 VecVal1 = BitCast(VecVT, In1, DAG);
9383 VecVal2 = BitCast(VecVT, In2, DAG);
9384 }
9385 };
9386 if (VT.isVector()) {
9387 VecVT = IntVT;
9388 SetVecVal();
9389 } else if (VT == MVT::f64) {
9390 VecVT = MVT::v2i64;
9391 SetVecVal(AArch64::dsub);
9392 } else if (VT == MVT::f32) {
9393 VecVT = MVT::v4i32;
9394 SetVecVal(AArch64::ssub);
9395 } else if (VT == MVT::f16 || VT == MVT::bf16) {
9396 VecVT = MVT::v8i16;
9397 SetVecVal(AArch64::hsub);
9398 } else {
9399 llvm_unreachable("Invalid type for copysign!");
9400 }
9401
9402 unsigned BitWidth = In1.getScalarValueSizeInBits();
9403 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
9404
9405 // We want to materialize a mask with every bit but the high bit set, but the
9406 // AdvSIMD immediate moves cannot materialize that in a single instruction for
9407 // 64-bit elements. Instead, materialize all bits set and then negate that.
9408 if (VT == MVT::f64 || VT == MVT::v2f64) {
9409 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
9410 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
9411 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
9412 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
9413 }
9414
9415 SDValue BSP =
9416 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
9417 if (VT == MVT::f16 || VT == MVT::bf16)
9418 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
9419 if (VT == MVT::f32)
9420 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
9421 if (VT == MVT::f64)
9422 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
9423
9424 return BitCast(VT, BSP, DAG);
9425}
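// For example, for f32 the mask materialized above is 0x7fffffff per lane, so
// BSP(mask, In1, In2) keeps the exponent/mantissa bits of In1 and the sign bit
// of In2, which is exactly copysign(In1, In2). For f64/v2f64 the same mask is
// produced indirectly as fneg(all-ones) because the AdvSIMD move immediates
// cannot encode 0x7fffffffffffffff in a single instruction.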
9426
9427SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
9428 SelectionDAG &DAG) const {
9429 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
9430 Attribute::NoImplicitFloat))
9431 return SDValue();
9432
9433 if (!Subtarget->hasNEON())
9434 return SDValue();
9435
9436 bool IsParity = Op.getOpcode() == ISD::PARITY;
9437 SDValue Val = Op.getOperand(0);
9438 SDLoc DL(Op);
9439 EVT VT = Op.getValueType();
9440
9441 // For i32, the generic parity expansion using EORs is more efficient than
9442 // going through the FP/SIMD registers, so bail out here.
9443 if (VT == MVT::i32 && IsParity)
9444 return SDValue();
9445
9446 // If there is no CNT instruction available, GPR popcount can
9447 // be more efficiently lowered to the following sequence that uses
9448 // AdvSIMD registers/instructions as long as the copies to/from
9449 // the AdvSIMD registers are cheap.
9450 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
9451 // CNT V0.8B, V0.8B // 8xbyte pop-counts
9452 // ADDV B0, V0.8B // sum 8xbyte pop-counts
9453 // UMOV X0, V0.B[0] // copy byte result back to integer reg
9454 if (VT == MVT::i32 || VT == MVT::i64) {
9455 if (VT == MVT::i32)
9456 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
9457 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
9458
9459 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
9460 SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
9461 UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
9462 DAG.getConstant(0, DL, MVT::i64));
9463
9464 if (IsParity)
9465 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
9466 DAG.getConstant(1, DL, MVT::i32));
9467
9468 if (VT == MVT::i64)
9469 UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
9470 return UaddLV;
9471 } else if (VT == MVT::i128) {
9472 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
9473
9474 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
9475 SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
9476 UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
9477 DAG.getConstant(0, DL, MVT::i64));
9478
9479 if (IsParity)
9480 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
9481 DAG.getConstant(1, DL, MVT::i32));
9482
9483 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
9484 }
9485
9486 assert(!IsParity && "ISD::PARITY of vector types not supported");
9487
9488 if (VT.isScalableVector() ||
9489 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
9490 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
9491
9492 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
9493 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
9494 "Unexpected type for custom ctpop lowering");
9495
9496 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
9497 Val = DAG.getBitcast(VT8Bit, Val);
9498 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
9499
9500 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
9501 unsigned EltSize = 8;
9502 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
9503 while (EltSize != VT.getScalarSizeInBits()) {
9504 EltSize *= 2;
9505 NumElts /= 2;
9506 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
9507 Val = DAG.getNode(
9508 ISD::INTRINSIC_WO_CHAIN, DL, WidenVT,
9509 DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
9510 }
9511
9512 return Val;
9513}
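// For example, a v4i32 ctpop is expected to lower roughly to
//   cnt    v0.16b, v0.16b       // per-byte population counts
//   uaddlp v0.8h,  v0.16b       // pairwise widen bytes into halfwords
//   uaddlp v0.4s,  v0.8h        // pairwise widen halfwords into words
// which is what the uaddlp widening loop above builds.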
9514
9515SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
9516 EVT VT = Op.getValueType();
9517 assert(VT.isScalableVector() ||
9518 useSVEForFixedLengthVectorVT(
9519 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
9520
9521 SDLoc DL(Op);
9522 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
9523 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
9524}
9525
9526SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
9527 SelectionDAG &DAG) const {
9528
9529 EVT VT = Op.getValueType();
9530 SDLoc DL(Op);
9531 unsigned Opcode = Op.getOpcode();
9532 ISD::CondCode CC;
9533 switch (Opcode) {
9534 default:
9535 llvm_unreachable("Wrong instruction");
9536 case ISD::SMAX:
9537 CC = ISD::SETGT;
9538 break;
9539 case ISD::SMIN:
9540 CC = ISD::SETLT;
9541 break;
9542 case ISD::UMAX:
9543 CC = ISD::SETUGT;
9544 break;
9545 case ISD::UMIN:
9546 CC = ISD::SETULT;
9547 break;
9548 }
9549
9550 if (VT.isScalableVector() ||
9551 useSVEForFixedLengthVectorVT(
9552 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
9553 switch (Opcode) {
9554 default:
9555 llvm_unreachable("Wrong instruction");
9556 case ISD::SMAX:
9557 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
9558 case ISD::SMIN:
9559 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
9560 case ISD::UMAX:
9561 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
9562 case ISD::UMIN:
9563 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
9564 }
9565 }
9566
9567 SDValue Op0 = Op.getOperand(0);
9568 SDValue Op1 = Op.getOperand(1);
9569 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
9570 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
9571}
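// For types that reach the generic path above, the setcc feeding a select is
// typically matched later as a compare plus csel, e.g. for i32 smax roughly:
//   cmp  w0, w1
//   csel w0, w0, w1, gt
// Scalable and SVE-lowered fixed-length vectors use the *_PRED nodes instead.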
9572
9573SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
9574 SelectionDAG &DAG) const {
9575 EVT VT = Op.getValueType();
9576
9577 if (VT.isScalableVector() ||
9578 useSVEForFixedLengthVectorVT(
9579 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
9580 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
9581
9582 SDLoc DL(Op);
9583 SDValue REVB;
9584 MVT VST;
9585
9586 switch (VT.getSimpleVT().SimpleTy) {
9587 default:
9588 llvm_unreachable("Invalid type for bitreverse!");
9589
9590 case MVT::v2i32: {
9591 VST = MVT::v8i8;
9592 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
9593
9594 break;
9595 }
9596
9597 case MVT::v4i32: {
9598 VST = MVT::v16i8;
9599 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
9600
9601 break;
9602 }
9603
9604 case MVT::v1i64: {
9605 VST = MVT::v8i8;
9606 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
9607
9608 break;
9609 }
9610
9611 case MVT::v2i64: {
9612 VST = MVT::v16i8;
9613 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
9614
9615 break;
9616 }
9617 }
9618
9619 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
9620 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
9621}
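// For example, a v4i32 bitreverse becomes roughly
//   rev32 v0.16b, v0.16b        // reverse byte order within each 32-bit lane
//   rbit  v0.16b, v0.16b        // then reverse the bits within each byte
// which together give a full per-element bit reversal.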
9622
9623 // Check whether N forms a chain of XOR compares combined by ORs, collecting the XOR operand pairs in WorkList.
9624static bool
9625isOrXorChain(SDValue N, unsigned &Num,
9626 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
9627 if (Num == MaxXors)
9628 return false;
9629
9630 // Skip the one-use zext
9631 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
9632 N = N->getOperand(0);
9633
9634 // The leaf node must be XOR
9635 if (N->getOpcode() == ISD::XOR) {
9636 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
9637 Num++;
9638 return true;
9639 }
9640
9641 // All the non-leaf nodes must be OR.
9642 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
9643 return false;
9644
9645 if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
9646 isOrXorChain(N->getOperand(1), Num, WorkList))
9647 return true;
9648 return false;
9649}
9650
9651 // Transform chains of ORs and XORs (typically produced by expanded memcmp/bcmp) into a conjunction of comparisons.
9652 static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
9653 SDValue LHS = N->getOperand(0);
9654 SDValue RHS = N->getOperand(1);
9655 SDLoc DL(N);
9656 EVT VT = N->getValueType(0);
9657 SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
9658
9659 // Only handle integer compares.
9660 if (N->getOpcode() != ISD::SETCC)
9661 return SDValue();
9662
9663 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
9664 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
9665 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
9666 unsigned NumXors = 0;
9667 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
9668 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
9669 isOrXorChain(LHS, NumXors, WorkList)) {
9670 SDValue XOR0, XOR1;
9671 std::tie(XOR0, XOR1) = WorkList[0];
9672 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
9673 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
9674 for (unsigned I = 1; I < WorkList.size(); I++) {
9675 std::tie(XOR0, XOR1) = WorkList[I];
9676 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
9677 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
9678 }
9679
9680 // Exit early by inverting the condition, which helps reduce indentation.
9681 return Cmp;
9682 }
9683
9684 return SDValue();
9685}
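// As an illustration, an expanded equality memcmp of two 16-byte buffers
//   (or (xor a0, b0), (xor a1, b1)) == 0
// is rebuilt here as (seteq a0, b0) and (seteq a1, b1), which later combines
// are expected to select as a cmp followed by a ccmp instead of an eor/orr
// chain feeding a single compare.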
9686
9687SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
9688
9689 if (Op.getValueType().isVector())
9690 return LowerVSETCC(Op, DAG);
9691
9692 bool IsStrict = Op->isStrictFPOpcode();
9693 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
9694 unsigned OpNo = IsStrict ? 1 : 0;
9695 SDValue Chain;
9696 if (IsStrict)
9697 Chain = Op.getOperand(0);
9698 SDValue LHS = Op.getOperand(OpNo + 0);
9699 SDValue RHS = Op.getOperand(OpNo + 1);
9700 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
9701 SDLoc dl(Op);
9702
9703 // We chose ZeroOrOneBooleanContents, so use zero and one.
9704 EVT VT = Op.getValueType();
9705 SDValue TVal = DAG.getConstant(1, dl, VT);
9706 SDValue FVal = DAG.getConstant(0, dl, VT);
9707
9708 // Handle f128 first, since one possible outcome is a normal integer
9709 // comparison which gets picked up by the next if statement.
9710 if (LHS.getValueType() == MVT::f128) {
9711 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
9712 IsSignaling);
9713
9714 // If softenSetCCOperands returned a scalar, use it.
9715 if (!RHS.getNode()) {
9716 assert(LHS.getValueType() == Op.getValueType() &&
9717 "Unexpected setcc expansion!");
9718 return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
9719 }
9720 }
9721
9722 if (LHS.getValueType().isInteger()) {
9723 SDValue CCVal;
9724 SDValue Cmp = getAArch64Cmp(
9725 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
9726
9727 // Note that we inverted the condition above, so we reverse the order of
9728 // the true and false operands here. This will allow the setcc to be
9729 // matched to a single CSINC instruction.
9730 SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
9731 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
9732 }
9733
9734 // Now we know we're dealing with FP values.
9735 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
9736 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
9737
9738 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
9739 // and do the comparison.
9740 SDValue Cmp;
9741 if (IsStrict)
9742 Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
9743 else
9744 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9745
9746 AArch64CC::CondCode CC1, CC2;
9747 changeFPCCToAArch64CC(CC, CC1, CC2);
9748 SDValue Res;
9749 if (CC2 == AArch64CC::AL) {
9750 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
9751 CC2);
9752 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9753
9754 // Note that we inverted the condition above, so we reverse the order of
9755 // the true and false operands here. This will allow the setcc to be
9756 // matched to a single CSINC instruction.
9757 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
9758 } else {
9759 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
9760 // totally clean. Some of them require two CSELs to implement. As is in
9761 // this case, we emit the first CSEL and then emit a second using the output
9762 // of the first as the RHS. We're effectively OR'ing the two CC's together.
9763
9764 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
9765 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9766 SDValue CS1 =
9767 DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
9768
9769 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9770 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
9771 }
9772 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
9773}
9774
9775SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
9776 SelectionDAG &DAG) const {
9777
9778 SDValue LHS = Op.getOperand(0);
9779 SDValue RHS = Op.getOperand(1);
9780 EVT VT = LHS.getValueType();
9781 if (VT != MVT::i32 && VT != MVT::i64)
9782 return SDValue();
9783
9784 SDLoc DL(Op);
9785 SDValue Carry = Op.getOperand(2);
9786 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
9787 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
9788 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue),
9789 LHS, RHS, InvCarry);
9790
9791 EVT OpVT = Op.getValueType();
9792 SDValue TVal = DAG.getConstant(1, DL, OpVT);
9793 SDValue FVal = DAG.getConstant(0, DL, OpVT);
9794
9795 ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
9796 ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
9797 SDValue CCVal =
9798 DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32);
9799 // Inputs are swapped because the condition is inverted. This will allow
9800 // matching with a single CSINC instruction.
9801 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
9802 Cmp.getValue(1));
9803}
9804
9805SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
9806 SDValue RHS, SDValue TVal,
9807 SDValue FVal, const SDLoc &dl,
9808 SelectionDAG &DAG) const {
9809 // Handle f128 first, because it will result in a comparison of some RTLIB
9810 // call result against zero.
9811 if (LHS.getValueType() == MVT::f128) {
9812 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
9813
9814 // If softenSetCCOperands returned a scalar, we need to compare the result
9815 // against zero to select between true and false values.
9816 if (!RHS.getNode()) {
9817 RHS = DAG.getConstant(0, dl, LHS.getValueType());
9818 CC = ISD::SETNE;
9819 }
9820 }
9821
9822 // Also handle f16, for which we need to do a f32 comparison.
9823 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
9824 LHS.getValueType() == MVT::bf16) {
9825 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
9826 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
9827 }
9828
9829 // Next, handle integers.
9830 if (LHS.getValueType().isInteger()) {
9831 assert((LHS.getValueType() == RHS.getValueType()) &&
9832 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
9833
9834 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
9835 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
9836 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
9837 // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
9838 // into (OR (ASR lhs, N-1), 1), which requires less instructions for the
9839 // supported types.
9840 if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
9841 CTVal->isOne() && CFVal->isAllOnes() &&
9842 LHS.getValueType() == TVal.getValueType()) {
9843 EVT VT = LHS.getValueType();
9844 SDValue Shift =
9845 DAG.getNode(ISD::SRA, dl, VT, LHS,
9846 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
9847 return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
9848 }
9849
9850 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
9851 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
9852 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
9853 // Both require less instructions than compare and conditional select.
9854 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
9855 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
9856 LHS.getValueType() == RHS.getValueType()) {
9857 EVT VT = LHS.getValueType();
9858 SDValue Shift =
9859 DAG.getNode(ISD::SRA, dl, VT, LHS,
9860 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
9861
9862 if (CC == ISD::SETGT)
9863 Shift = DAG.getNOT(dl, Shift, VT);
9864
9865 return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
9866 }
9867
9868 unsigned Opcode = AArch64ISD::CSEL;
9869
9870 // If both the TVal and the FVal are constants, see if we can swap them in
9871 // order to form a CSINV or CSINC out of them.
9872 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
9873 std::swap(TVal, FVal);
9874 std::swap(CTVal, CFVal);
9875 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9876 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
9877 std::swap(TVal, FVal);
9878 std::swap(CTVal, CFVal);
9879 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9880 } else if (TVal.getOpcode() == ISD::XOR) {
9881 // If TVal is a NOT we want to swap TVal and FVal so that we can match
9882 // with a CSINV rather than a CSEL.
9883 if (isAllOnesConstant(TVal.getOperand(1))) {
9884 std::swap(TVal, FVal);
9885 std::swap(CTVal, CFVal);
9886 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9887 }
9888 } else if (TVal.getOpcode() == ISD::SUB) {
9889 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
9890 // that we can match with a CSNEG rather than a CSEL.
9891 if (isNullConstant(TVal.getOperand(0))) {
9892 std::swap(TVal, FVal);
9893 std::swap(CTVal, CFVal);
9894 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9895 }
9896 } else if (CTVal && CFVal) {
9897 const int64_t TrueVal = CTVal->getSExtValue();
9898 const int64_t FalseVal = CFVal->getSExtValue();
9899 bool Swap = false;
9900
9901 // If both TVal and FVal are constants, see if FVal is the
9902 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
9903 // instead of a CSEL in that case.
9904 if (TrueVal == ~FalseVal) {
9905 Opcode = AArch64ISD::CSINV;
9906 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
9907 TrueVal == -FalseVal) {
9908 Opcode = AArch64ISD::CSNEG;
9909 } else if (TVal.getValueType() == MVT::i32) {
9910 // If our operands are only 32-bit wide, make sure we use 32-bit
9911 // arithmetic for the check whether we can use CSINC. This ensures that
9912 // the addition in the check will wrap around properly in case there is
9913 // an overflow (which would not be the case if we do the check with
9914 // 64-bit arithmetic).
9915 const uint32_t TrueVal32 = CTVal->getZExtValue();
9916 const uint32_t FalseVal32 = CFVal->getZExtValue();
9917
9918 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
9919 Opcode = AArch64ISD::CSINC;
9920
9921 if (TrueVal32 > FalseVal32) {
9922 Swap = true;
9923 }
9924 }
9925 } else {
9926 // 64-bit check whether we can use CSINC.
9927 const uint64_t TrueVal64 = TrueVal;
9928 const uint64_t FalseVal64 = FalseVal;
9929
9930 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
9931 Opcode = AArch64ISD::CSINC;
9932
9933 if (TrueVal > FalseVal) {
9934 Swap = true;
9935 }
9936 }
9937 }
9938
9939 // Swap TVal and FVal if necessary.
9940 if (Swap) {
9941 std::swap(TVal, FVal);
9942 std::swap(CTVal, CFVal);
9943 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9944 }
9945
9946 if (Opcode != AArch64ISD::CSEL) {
9947 // Drop FVal since we can get its value by simply inverting/negating
9948 // TVal.
9949 FVal = TVal;
9950 }
9951 }
9952
9953 // Avoid materializing a constant when possible by reusing a known value in
9954 // a register. However, don't perform this optimization if the known value
9955 // is one, zero or negative one in the case of a CSEL. We can always
9956 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
9957 // FVal, respectively.
9958 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
9959 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
9960 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
9961 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
9962 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
9963 // "a != C ? x : a" to avoid materializing C.
9964 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
9965 TVal = LHS;
9966 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
9967 FVal = LHS;
9968 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
9969 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
9970 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
9971 // avoid materializing C.
9972 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
9973 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
9974 Opcode = AArch64ISD::CSINV;
9975 TVal = LHS;
9976 FVal = DAG.getConstant(0, dl, FVal.getValueType());
9977 }
9978 }
9979
9980 SDValue CCVal;
9981 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
9982 EVT VT = TVal.getValueType();
9983 return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
9984 }
9985
9986 // Now we know we're dealing with FP values.
9987 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
9988 LHS.getValueType() == MVT::f64);
9989 assert(LHS.getValueType() == RHS.getValueType());
9990 EVT VT = TVal.getValueType();
9991 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9992
9993 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
9994 // clean. Some of them require two CSELs to implement.
9995 AArch64CC::CondCode CC1, CC2;
9996 changeFPCCToAArch64CC(CC, CC1, CC2);
9997
9998 if (DAG.getTarget().Options.UnsafeFPMath) {
9999 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
10000 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
10001 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
10002 if (RHSVal && RHSVal->isZero()) {
10003 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
10004 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
10005
10006 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
10007 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
10008 TVal = LHS;
10009 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
10010 CFVal && CFVal->isZero() &&
10011 FVal.getValueType() == LHS.getValueType())
10012 FVal = LHS;
10013 }
10014 }
10015
10016 // Emit first, and possibly only, CSEL.
10017 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
10018 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
10019
10020 // If we need a second CSEL, emit it, using the output of the first as the
10021 // RHS. We're effectively OR'ing the two CC's together.
10022 if (CC2 != AArch64CC::AL) {
10023 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
10024 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
10025 }
10026
10027 // Otherwise, return the output of the first CSEL.
10028 return CS1;
10029}
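// A few illustrative constant cases handled above (not exhaustive):
//   select i1 %c, i32 5, i32 4    -> CSINC  (TrueVal == FalseVal + 1)
//   select i1 %c, i32 -6, i32 5   -> CSINV  (TrueVal == ~FalseVal)
//   select i1 %c, i64 3, i64 -3   -> CSNEG  (TrueVal == -FalseVal)
// In each case FVal is dropped and recovered by incrementing, inverting or
// negating TVal.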
10030
10031SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
10032 SelectionDAG &DAG) const {
10033 EVT Ty = Op.getValueType();
10034 auto Idx = Op.getConstantOperandAPInt(2);
10035 int64_t IdxVal = Idx.getSExtValue();
10036 assert(Ty.isScalableVector() &&
10037 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
10038
10039 // We can use the splice instruction for certain index values where we are
10040 // able to efficiently generate the correct predicate. The index will be
10041 // inverted and used directly as the input to the ptrue instruction, i.e.
10042 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
10043 // splice predicate. However, we can only do this if we can guarantee that
10044 // there are enough elements in the vector, hence we check the index <= min
10045 // number of elements.
10046 std::optional<unsigned> PredPattern;
10047 if (Ty.isScalableVector() && IdxVal < 0 &&
10048 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
10049 std::nullopt) {
10050 SDLoc DL(Op);
10051
10052 // Create a predicate where all but the last -IdxVal elements are false.
10053 EVT PredVT = Ty.changeVectorElementType(MVT::i1);
10054 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
10055 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
10056
10057 // Now splice the two inputs together using the predicate.
10058 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
10059 Op.getOperand(1));
10060 }
10061
10062 // This will select to an EXT instruction, which has a maximum immediate
10063 // value of 255, hence 2048-bits is the maximum value we can lower.
10064 if (IdxVal >= 0 &&
10065 IdxVal < int64_t(2048 / Ty.getVectorElementType().getSizeInBits()))
10066 return Op;
10067
10068 return SDValue();
10069}
10070
10071SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
10072 SelectionDAG &DAG) const {
10073 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
10074 SDValue LHS = Op.getOperand(0);
10075 SDValue RHS = Op.getOperand(1);
10076 SDValue TVal = Op.getOperand(2);
10077 SDValue FVal = Op.getOperand(3);
10078 SDLoc DL(Op);
10079 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
10080}
10081
10082SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
10083 SelectionDAG &DAG) const {
10084 SDValue CCVal = Op->getOperand(0);
10085 SDValue TVal = Op->getOperand(1);
10086 SDValue FVal = Op->getOperand(2);
10087 SDLoc DL(Op);
10088
10089 EVT Ty = Op.getValueType();
10090 if (Ty == MVT::aarch64svcount) {
10091 TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
10092 FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
10093 SDValue Sel =
10094 DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
10095 return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
10096 }
10097
10098 if (Ty.isScalableVector()) {
10099 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
10100 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
10101 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
10102 }
10103
10104 if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
10105 // FIXME: Ideally this would be the same as above using i1 types, however
10106 // for the moment we can't deal with fixed i1 vector types properly, so
10107 // instead extend the predicate to a result type sized integer vector.
10108 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
10109 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
10110 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
10111 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
10112 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
10113 }
10114
10115 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
10116 // instruction.
10117 if (ISD::isOverflowIntrOpRes(CCVal)) {
10118 // Only lower legal XALUO ops.
10119 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
10120 return SDValue();
10121
10122 AArch64CC::CondCode OFCC;
10123 SDValue Value, Overflow;
10124 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
10125 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
10126
10127 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
10128 CCVal, Overflow);
10129 }
10130
10131 // Lower it the same way as we would lower a SELECT_CC node.
10132 ISD::CondCode CC;
10133 SDValue LHS, RHS;
10134 if (CCVal.getOpcode() == ISD::SETCC) {
10135 LHS = CCVal.getOperand(0);
10136 RHS = CCVal.getOperand(1);
10137 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
10138 } else {
10139 LHS = CCVal;
10140 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
10141 CC = ISD::SETNE;
10142 }
10143
10144 // If we are lowering an f16 or bf16 and do not have full FP16 support,
10145 // convert to f32 in order to use FCSELSrrr.
10146 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
10147 TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
10148 DAG.getUNDEF(MVT::f32), TVal);
10149 FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
10150 DAG.getUNDEF(MVT::f32), FVal);
10151 }
10152
10153 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
10154
10155 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
10156 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
10157 }
10158
10159 return Res;
10160}
10161
10162SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
10163 SelectionDAG &DAG) const {
10164 // Jump table entries are emitted as PC-relative offsets. No additional
10165 // tweaking is necessary here; just get the address of the jump table.
10166 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
10167
10168 CodeModel::Model CM = getTargetMachine().getCodeModel();
10169 if (CM == CodeModel::Large && !getTargetMachine().isPositionIndependent() &&
10170 !Subtarget->isTargetMachO())
10171 return getAddrLarge(JT, DAG);
10172 if (CM == CodeModel::Tiny)
10173 return getAddrTiny(JT, DAG);
10174 return getAddr(JT, DAG);
10175}
10176
10177SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
10178 SelectionDAG &DAG) const {
10179 // Jump table entries are emitted as PC-relative offsets. No additional
10180 // tweaking is necessary here; just get the address of the jump table.
10181 SDLoc DL(Op);
10182 SDValue JT = Op.getOperand(1);
10183 SDValue Entry = Op.getOperand(2);
10184 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
10185
10186 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10187 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
10188
10189 SDNode *Dest =
10190 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
10191 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
10192 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
10193 return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
10194}
10195
10196SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
10197 SelectionDAG &DAG) const {
10198 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
10199 CodeModel::Model CM = getTargetMachine().getCodeModel();
10200 if (CM == CodeModel::Large) {
10201 // Use the GOT for the large code model on iOS.
10202 if (Subtarget->isTargetMachO()) {
10203 return getGOT(CP, DAG);
10204 }
10205 if (!getTargetMachine().isPositionIndependent())
10206 return getAddrLarge(CP, DAG);
10207 } else if (CM == CodeModel::Tiny) {
10208 return getAddrTiny(CP, DAG);
10209 }
10210 return getAddr(CP, DAG);
10211}
10212
10213SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
10214 SelectionDAG &DAG) const {
10215 BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
10216 CodeModel::Model CM = getTargetMachine().getCodeModel();
10217 if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
10218 if (!getTargetMachine().isPositionIndependent())
10219 return getAddrLarge(BA, DAG);
10220 } else if (CM == CodeModel::Tiny) {
10221 return getAddrTiny(BA, DAG);
10222 }
10223 return getAddr(BA, DAG);
10224}
10225
10226SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
10227 SelectionDAG &DAG) const {
10228 AArch64FunctionInfo *FuncInfo =
10229 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10230
10231 SDLoc DL(Op);
10232 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
10233 getPointerTy(DAG.getDataLayout()));
10234 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
10235 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10236 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
10237 MachinePointerInfo(SV));
10238}
10239
10240SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
10241 SelectionDAG &DAG) const {
10242 MachineFunction &MF = DAG.getMachineFunction();
10243 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10244
10245 SDLoc DL(Op);
10246 SDValue FR;
10247 if (Subtarget->isWindowsArm64EC()) {
10248 // With the Arm64EC ABI, we compute the address of the varargs save area
10249 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
10250 // but calls from an entry thunk can pass in a different address.
10251 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
10252 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
10253 uint64_t StackOffset;
10254 if (FuncInfo->getVarArgsGPRSize() > 0)
10255 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
10256 else
10257 StackOffset = FuncInfo->getVarArgsStackOffset();
10258 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
10259 DAG.getConstant(StackOffset, DL, MVT::i64));
10260 } else {
10261 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
10262 ? FuncInfo->getVarArgsGPRIndex()
10263 : FuncInfo->getVarArgsStackIndex(),
10264 getPointerTy(DAG.getDataLayout()));
10265 }
10266 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10267 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
10268 MachinePointerInfo(SV));
10269}
10270
10271SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
10272 SelectionDAG &DAG) const {
10273 // The layout of the va_list struct is specified in the AArch64 Procedure Call
10274 // Standard, section B.3.
10275 MachineFunction &MF = DAG.getMachineFunction();
10276 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10277 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
10278 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10279 auto PtrVT = getPointerTy(DAG.getDataLayout());
10280 SDLoc DL(Op);
10281
10282 SDValue Chain = Op.getOperand(0);
10283 SDValue VAList = Op.getOperand(1);
10284 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10285 SmallVector<SDValue, 4> MemOps;
10286
10287 // void *__stack at offset 0
10288 unsigned Offset = 0;
10289 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
10290 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
10291 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
10292 MachinePointerInfo(SV), Align(PtrSize)));
10293
10294 // void *__gr_top at offset 8 (4 on ILP32)
10295 Offset += PtrSize;
10296 int GPRSize = FuncInfo->getVarArgsGPRSize();
10297 if (GPRSize > 0) {
10298 SDValue GRTop, GRTopAddr;
10299
10300 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10301 DAG.getConstant(Offset, DL, PtrVT));
10302
10303 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
10304 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
10305 DAG.getConstant(GPRSize, DL, PtrVT));
10306 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
10307
10308 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
10309 MachinePointerInfo(SV, Offset),
10310 Align(PtrSize)));
10311 }
10312
10313 // void *__vr_top at offset 16 (8 on ILP32)
10314 Offset += PtrSize;
10315 int FPRSize = FuncInfo->getVarArgsFPRSize();
10316 if (FPRSize > 0) {
10317 SDValue VRTop, VRTopAddr;
10318 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10319 DAG.getConstant(Offset, DL, PtrVT));
10320
10321 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
10322 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
10323 DAG.getConstant(FPRSize, DL, PtrVT));
10324 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
10325
10326 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
10327 MachinePointerInfo(SV, Offset),
10328 Align(PtrSize)));
10329 }
10330
10331 // int __gr_offs at offset 24 (12 on ILP32)
10332 Offset += PtrSize;
10333 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10334 DAG.getConstant(Offset, DL, PtrVT));
10335 MemOps.push_back(
10336 DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32),
10337 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
10338
10339 // int __vr_offs at offset 28 (16 on ILP32)
10340 Offset += 4;
10341 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10342 DAG.getConstant(Offset, DL, PtrVT));
10343 MemOps.push_back(
10344 DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32),
10345 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
10346
10347 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
10348}
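// The stores above populate the AAPCS64 va_list, which is laid out roughly as
// follows (on ILP32 the pointer fields are 4 bytes, giving offsets 0/4/8/12/16):
//   struct va_list {
//     void *__stack;    // offset 0
//     void *__gr_top;   // offset 8
//     void *__vr_top;   // offset 16
//     int   __gr_offs;  // offset 24, initialized to -(size of saved GPR area)
//     int   __vr_offs;  // offset 28, initialized to -(size of saved FPR area)
//   };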
10349
10350SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
10351 SelectionDAG &DAG) const {
10352 MachineFunction &MF = DAG.getMachineFunction();
10353
10354 if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()))
10355 return LowerWin64_VASTART(Op, DAG);
10356 else if (Subtarget->isTargetDarwin())
10357 return LowerDarwin_VASTART(Op, DAG);
10358 else
10359 return LowerAAPCS_VASTART(Op, DAG);
10360}
10361
10362SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
10363 SelectionDAG &DAG) const {
10364 // AAPCS has three pointers and two ints (= 32 bytes, 20 on ILP32); Darwin
10365 // and Windows use a single pointer.
10366 SDLoc DL(Op);
10367 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
10368 unsigned VaListSize =
10369 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
10370 ? PtrSize
10371 : Subtarget->isTargetILP32() ? 20 : 32;
10372 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
10373 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
10374
10375 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
10376 DAG.getConstant(VaListSize, DL, MVT::i32),
10377 Align(PtrSize), false, false, false,
10378 MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
10379}
10380
10381SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
10382 assert(Subtarget->isTargetDarwin() &&
10383 "automatic va_arg instruction only works on Darwin");
10384
10385 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10386 EVT VT = Op.getValueType();
10387 SDLoc DL(Op);
10388 SDValue Chain = Op.getOperand(0);
10389 SDValue Addr = Op.getOperand(1);
10390 MaybeAlign Align(Op.getConstantOperandVal(3));
10391 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
10392 auto PtrVT = getPointerTy(DAG.getDataLayout());
10393 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10394 SDValue VAList =
10395 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
10396 Chain = VAList.getValue(1);
10397 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
10398
10399 if (VT.isScalableVector())
10400 report_fatal_error("Passing SVE types to variadic functions is "
10401 "currently not supported");
10402
10403 if (Align && *Align > MinSlotSize) {
10404 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10405 DAG.getConstant(Align->value() - 1, DL, PtrVT));
10406 VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
10407 DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
10408 }
10409
10410 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
10411 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
10412
10413 // Scalar integer and FP values smaller than 64 bits are implicitly extended
10414 // up to 64 bits. At the very least, we have to increase the striding of the
10415 // vaargs list to match this, and for FP values we need to introduce
10416 // FP_ROUND nodes as well.
10417 if (VT.isInteger() && !VT.isVector())
10418 ArgSize = std::max(ArgSize, MinSlotSize);
10419 bool NeedFPTrunc = false;
10420 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
10421 ArgSize = 8;
10422 NeedFPTrunc = true;
10423 }
10424
10425 // Increment the pointer, VAList, to the next vaarg
10426 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10427 DAG.getConstant(ArgSize, DL, PtrVT));
10428 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
10429
10430 // Store the incremented VAList to the legalized pointer
10431 SDValue APStore =
10432 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
10433
10434 // Load the actual argument out of the pointer VAList
10435 if (NeedFPTrunc) {
10436 // Load the value as an f64.
10437 SDValue WideFP =
10438 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
10439 // Round the value down to an f32.
10440 SDValue NarrowFP =
10441 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
10442 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
10443 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
10444 // Merge the rounded value with the chain output of the load.
10445 return DAG.getMergeValues(Ops, DL);
10446 }
10447
10448 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
10449}
10450
10451SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
10452 SelectionDAG &DAG) const {
10453 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10454 MFI.setFrameAddressIsTaken(true);
10455
10456 EVT VT = Op.getValueType();
10457 SDLoc DL(Op);
10458 unsigned Depth = Op.getConstantOperandVal(0);
10459 SDValue FrameAddr =
10460 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
10461 while (Depth--)
10462 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
10463 MachinePointerInfo());
10464
10465 if (Subtarget->isTargetILP32())
10466 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
10467 DAG.getValueType(VT));
10468
10469 return FrameAddr;
10470}
10471
10472SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
10473 SelectionDAG &DAG) const {
10474 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10475
10476 EVT VT = getPointerTy(DAG.getDataLayout());
10477 SDLoc DL(Op);
10478 int FI = MFI.CreateFixedObject(4, 0, false);
10479 return DAG.getFrameIndex(FI, VT);
10480}
10481
10482#define GET_REGISTER_MATCHER
10483#include "AArch64GenAsmMatcher.inc"
10484
10485// FIXME? Maybe this could be a TableGen attribute on some registers and
10486// this table could be generated automatically from RegInfo.
10487Register AArch64TargetLowering::
10488 getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
10489 Register Reg = MatchRegisterName(RegName);
10490 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
10491 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
10492 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
10493 if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
10494 !MRI->isReservedReg(MF, Reg))
10495 Reg = 0;
10496 }
10497 if (Reg)
10498 return Reg;
10499 report_fatal_error(Twine("Invalid register name \""
10500 + StringRef(RegName) + "\"."));
10501}
10502
10503SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
10504 SelectionDAG &DAG) const {
10505 DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
10506
10507 EVT VT = Op.getValueType();
10508 SDLoc DL(Op);
10509
10510 SDValue FrameAddr =
10511 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
10512 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
10513
10514 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
10515}
10516
10517SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
10518 SelectionDAG &DAG) const {
10519 MachineFunction &MF = DAG.getMachineFunction();
10520 MachineFrameInfo &MFI = MF.getFrameInfo();
10521 MFI.setReturnAddressIsTaken(true);
10522
10523 EVT VT = Op.getValueType();
10524 SDLoc DL(Op);
10525 unsigned Depth = Op.getConstantOperandVal(0);
10526 SDValue ReturnAddress;
10527 if (Depth) {
10528 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
10529 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
10530 ReturnAddress = DAG.getLoad(
10531 VT, DL, DAG.getEntryNode(),
10532 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
10533 } else {
10534 // Return LR, which contains the return address. Mark it an implicit
10535 // live-in.
10536 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
10537 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
10538 }
10539
10540 // The XPACLRI instruction assembles to a hint-space instruction before
10541 // Armv8.3-A, so it can safely be used on any pre-Armv8.3-A architecture.
10542 // On Armv8.3-A and onwards XPACI is available, so use that
10543 // instead.
10544 SDNode *St;
10545 if (Subtarget->hasPAuth()) {
10546 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
10547 } else {
10548 // XPACLRI operates on LR therefore we must move the operand accordingly.
10549 SDValue Chain =
10550 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
10551 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
10552 }
10553 return SDValue(St, 0);
10554}
10555
10556 /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
10557 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
10558SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
10559 SelectionDAG &DAG) const {
10560 SDValue Lo, Hi;
10561 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
10562 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
10563}
10564
10565 bool AArch64TargetLowering::isOffsetFoldingLegal(
10566 const GlobalAddressSDNode *GA) const {
10567 // Offsets are folded in the DAG combine rather than here so that we can
10568 // intelligently choose an offset based on the uses.
10569 return false;
10570}
10571
10572 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
10573 bool OptForSize) const {
10574 bool IsLegal = false;
10575 // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
10576 // and for the 16-bit case when the target has full fp16 support.
10577 // We encode bf16 bit patterns as if they were fp16. This results in very
10578 // strange looking assembly but should populate the register with appropriate
10579 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
10580 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
10581 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
10582 // FIXME: We should be able to handle f128 as well with a clever lowering.
10583 const APInt ImmInt = Imm.bitcastToAPInt();
10584 if (VT == MVT::f64)
10585 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
10586 else if (VT == MVT::f32)
10587 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
10588 else if (VT == MVT::f16 || VT == MVT::bf16)
10589 IsLegal =
10590 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
10591 Imm.isPosZero();
10592
10593 // If we cannot materialize the value in the immediate field of an fmov,
10594 // check if it can be encoded as the immediate operand of a logical instruction.
10595 // The immediate value will be created with either MOVZ, MOVN, or ORR.
10596 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
10597 // generate that fmov.
10598 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
10599 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
10600 // however the mov+fmov sequence is always better because of the reduced
10601 // cache pressure. The timings are still the same if you consider
10602 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
10603 // movw+movk is fused). So we limit this to at most 2 instructions.
10604 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
10605 AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn);
10606 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
10607 IsLegal = Insn.size() <= Limit;
10608 }
10609
10610 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
10611 << " imm value: "; Imm.dump(););
10612 return IsLegal;
10613}
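// Worked examples for the FMOV immediate check above. The imm8 encoding
// covers values of the form +/- n/16 * 2^r with n in [16, 31] and r in
// [-3, 4] (so magnitudes from 0.125 to 31.0 on that grid):
//   1.0, 0.5, 0.25, 2.0, 3.0, 17.0 -> encodable, a single FMOV is legal.
//   0.1, 3.14159, 1e10             -> not encodable; for f32/f64 legality then
//                                     depends on the MOV/MOVK + FMOV count above.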
10614
10615//===----------------------------------------------------------------------===//
10616// AArch64 Optimization Hooks
10617//===----------------------------------------------------------------------===//
10618
10619static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
10620 SDValue Operand, SelectionDAG &DAG,
10621 int &ExtraSteps) {
10622 EVT VT = Operand.getValueType();
10623 if ((ST->hasNEON() &&
10624 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
10625 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
10626 VT == MVT::v4f32)) ||
10627 (ST->hasSVE() &&
10628 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
10629 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) {
10630 // For the reciprocal estimates, convergence is quadratic, so the number
10631 // of digits is doubled after each iteration. In ARMv8, the accuracy of
10632 // the initial estimate is 2^-8. Thus the number of extra steps to refine
10633 // the result for float (23 mantissa bits) is 2 and for double (52
10634 // mantissa bits) is 3.
10635 constexpr unsigned AccurateBits = 8;
10636 unsigned DesiredBits =
10637 APFloat::semanticsPrecision(DAG.EVTToAPFloatSemantics(VT));
10638 ExtraSteps = DesiredBits <= AccurateBits
10639 ? 0
10640 : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
10641 }
10642
10643 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
10644 }
10645
10646 return SDValue();
10647}
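// Worked example of the ExtraSteps computation above, assuming
// semanticsPrecision reports 24 bits for f32 and 53 bits for f64:
//   f32: DesiredBits = 24, AccurateBits = 8,
//        ExtraSteps = ceil(log2(24)) - ceil(log2(8)) = 5 - 3 = 2
//   f64: DesiredBits = 53,
//        ExtraSteps = ceil(log2(53)) - ceil(log2(8)) = 6 - 3 = 3
// which matches the "2 for float, 3 for double" statement in the comment.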
10648
10649SDValue
10650AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
10651 const DenormalMode &Mode) const {
10652 SDLoc DL(Op);
10653 EVT VT = Op.getValueType();
10654 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
10655 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
10656 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
10657}
10658
10659SDValue
10660AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
10661 SelectionDAG &DAG) const {
10662 return Op;
10663}
10664
10665SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
10666 SelectionDAG &DAG, int Enabled,
10667 int &ExtraSteps,
10668 bool &UseOneConst,
10669 bool Reciprocal) const {
10670 if (Enabled == ReciprocalEstimate::Enabled ||
10671 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
10672 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
10673 DAG, ExtraSteps)) {
10674 SDLoc DL(Operand);
10675 EVT VT = Operand.getValueType();
10676
10677 SDNodeFlags Flags;
10678 Flags.setAllowReassociation(true);
10679
10680 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
10681 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
10682 for (int i = ExtraSteps; i > 0; --i) {
10683 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
10684 Flags);
10685 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
10686 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
10687 }
10688 if (!Reciprocal)
10689 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
10690
10691 ExtraSteps = 0;
10692 return Estimate;
10693 }
10694
10695 return SDValue();
10696}
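// Derivation behind the loop above, included as a worked equation: to refine
// an estimate E of 1/sqrt(X), apply Newton's method to f(x) = 1/x^2 - X:
//   x_{n+1} = x_n - f(x_n)/f'(x_n) = x_n * (3 - X * x_n^2) / 2
// FRSQRTS(a, b) yields (3 - a*b)/2, so each iteration computes
//   Step = FRSQRTS(X, E*E) = (3 - X*E^2)/2 and then E = E * Step,
// and the final multiply by Operand turns 1/sqrt(X) into sqrt(X) when a
// non-reciprocal square root was requested.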
10697
10698SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
10699 SelectionDAG &DAG, int Enabled,
10700 int &ExtraSteps) const {
10701 if (Enabled == ReciprocalEstimate::Enabled)
10702 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
10703 DAG, ExtraSteps)) {
10704 SDLoc DL(Operand);
10705 EVT VT = Operand.getValueType();
10706
10707 SDNodeFlags Flags;
10708 Flags.setAllowReassociation(true);
10709
10710 // Newton reciprocal iteration: E * (2 - X * E)
10711 // AArch64 reciprocal iteration instruction: (2 - M * N)
10712 for (int i = ExtraSteps; i > 0; --i) {
10713 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
10714 Estimate, Flags);
10715 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
10716 }
10717
10718 ExtraSteps = 0;
10719 return Estimate;
10720 }
10721
10722 return SDValue();
10723}
10724
10725//===----------------------------------------------------------------------===//
10726// AArch64 Inline Assembly Support
10727//===----------------------------------------------------------------------===//
10728
10729// Table of Constraints
10730// TODO: This is the current set of constraints supported by ARM for the
10731 // compiler; not all of them may make sense.
10732//
10733// r - A general register
10734// w - An FP/SIMD register of some size in the range v0-v31
10735// x - An FP/SIMD register of some size in the range v0-v15
10736// I - Constant that can be used with an ADD instruction
10737// J - Constant that can be used with a SUB instruction
10738// K - Constant that can be used with a 32-bit logical instruction
10739// L - Constant that can be used with a 64-bit logical instruction
10740// M - Constant that can be used as a 32-bit MOV immediate
10741// N - Constant that can be used as a 64-bit MOV immediate
10742// Q - A memory reference with base register and no offset
10743// S - A symbolic address
10744// Y - Floating point constant zero
10745// Z - Integer constant zero
10746//
10747// Note that general register operands will be output using their 64-bit x
10748// register name, whatever the size of the variable, unless the asm operand
10749// is prefixed by the %w modifier. Floating-point and SIMD register operands
10750// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
10751// %q modifier.
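// A short usage sketch of the constraints above as they appear in C inline
// assembly (illustrative only; variables a, x and y are assumed to be an int
// and two floats, and %w/%s are the 32-bit GPR/FP operand modifiers):
//
//   int r;
//   asm("add %w0, %w1, %2" : "=r"(r) : "r"(a), "I"(4095));
//
//   float s;
//   asm("fadd %s0, %s1, %s2" : "=w"(s) : "w"(x), "w"(y));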
10752const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
10753 // At this point, we have to lower this constraint to something else, so we
10754 // lower it to an "r" or "w". However, by doing this we will force the result
10755 // to be in register, while the X constraint is much more permissive.
10756 //
10757 // Although we are correct (we are free to emit anything, without
10758 // constraints), we might break use cases that would expect us to be more
10759 // efficient and emit something else.
10760 if (!Subtarget->hasFPARMv8())
10761 return "r";
10762
10763 if (ConstraintVT.isFloatingPoint())
10764 return "w";
10765
10766 if (ConstraintVT.isVector() &&
10767 (ConstraintVT.getSizeInBits() == 64 ||
10768 ConstraintVT.getSizeInBits() == 128))
10769 return "w";
10770
10771 return "r";
10772}
10773
10774 enum class PredicateConstraint { Uph, Upl, Upa };
10775
10776 static std::optional<PredicateConstraint>
10777 parsePredicateConstraint(StringRef Constraint) {
10778 return StringSwitch<std::optional<PredicateConstraint>>(Constraint)
10779 .Case("Uph", PredicateConstraint::Uph)
10780 .Case("Upl", PredicateConstraint::Upl)
10781 .Case("Upa", PredicateConstraint::Upa)
10782 .Default(std::nullopt);
10783}
10784
10785static const TargetRegisterClass *
10786 getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) {
10787 if (VT != MVT::aarch64svcount &&
10788 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
10789 return nullptr;
10790
10791 switch (Constraint) {
10792 case PredicateConstraint::Uph:
10793 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
10794 : &AArch64::PPR_p8to15RegClass;
10795 case PredicateConstraint::Upl:
10796 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
10797 : &AArch64::PPR_3bRegClass;
10798 case PredicateConstraint::Upa:
10799 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
10800 : &AArch64::PPRRegClass;
10801 }
10802
10803 llvm_unreachable("Missing PredicateConstraint!");
10804}
10805
10806 enum class ReducedGprConstraint { Uci, Ucj };
10807
10808 static std::optional<ReducedGprConstraint>
10809 parseReducedGprConstraint(StringRef Constraint) {
10810 return StringSwitch<std::optional<ReducedGprConstraint>>(Constraint)
10811 .Case("Uci", ReducedGprConstraint::Uci)
10812 .Case("Ucj", ReducedGprConstraint::Ucj)
10813 .Default(std::nullopt);
10814}
10815
10816static const TargetRegisterClass *
10817 getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT) {
10818 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
10819 return nullptr;
10820
10821 switch (Constraint) {
10822 case ReducedGprConstraint::Uci:
10823 return &AArch64::MatrixIndexGPR32_8_11RegClass;
10824 case ReducedGprConstraint::Ucj:
10825 return &AArch64::MatrixIndexGPR32_12_15RegClass;
10826 }
10827
10828 llvm_unreachable("Missing ReducedGprConstraint!");
10829}
10830
10831// The set of cc code supported is from
10832// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
10833 static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint) {
10834 AArch64CC::CondCode Cond = StringSwitch<AArch64CC::CondCode>(Constraint)
10835 .Case("{@cchi}", AArch64CC::HI)
10836 .Case("{@cccs}", AArch64CC::HS)
10837 .Case("{@cclo}", AArch64CC::LO)
10838 .Case("{@ccls}", AArch64CC::LS)
10839 .Case("{@cccc}", AArch64CC::LO)
10840 .Case("{@cceq}", AArch64CC::EQ)
10841 .Case("{@ccgt}", AArch64CC::GT)
10842 .Case("{@ccge}", AArch64CC::GE)
10843 .Case("{@cclt}", AArch64CC::LT)
10844 .Case("{@ccle}", AArch64CC::LE)
10845 .Case("{@cchs}", AArch64CC::HS)
10846 .Case("{@ccne}", AArch64CC::NE)
10847 .Case("{@ccvc}", AArch64CC::VC)
10848 .Case("{@ccpl}", AArch64CC::PL)
10849 .Case("{@ccvs}", AArch64CC::VS)
10850 .Case("{@ccmi}", AArch64CC::MI)
10851 .Default(AArch64CC::Invalid);
10852 return Cond;
10853}
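// These strings back the flag-output operand syntax of extended inline
// assembly. A hedged example of the source-level form this expects, using the
// documented "=@cc<cond>" spelling (a and b assumed to be ints):
//
//   int eq;
//   asm("cmp %w1, %w2" : "=@cceq"(eq) : "r"(a), "r"(b));
//
// "eq" becomes 1 when the comparison sets Z and 0 otherwise; the CSINC built
// by getSETCC below materializes that value from NZCV.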
10854
10855/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
10856/// WZR, invert(<cond>)'.
10857 static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL,
10858 SelectionDAG &DAG) {
10859 return DAG.getNode(
10860 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
10861 DAG.getConstant(0, DL, MVT::i32),
10862 DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32), NZCV);
10863}
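// Concrete instance of the identity used above: for CC == EQ this node
// corresponds to "csinc w0, wzr, wzr, ne", which is exactly what the
// assembler alias "cset w0, eq" expands to (0 + 1 when EQ holds, 0 otherwise).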
10864
10865// Lower @cc flag output via getSETCC.
10866SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
10867 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
10868 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
10869 AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
10870 if (Cond == AArch64CC::Invalid)
10871 return SDValue();
10872 // The output variable should be a scalar integer.
10873 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
10874 OpInfo.ConstraintVT.getSizeInBits() < 8)
10875 report_fatal_error("Flag output operand is of invalid type");
10876
10877 // Get NZCV register. Only update chain when copyfrom is glued.
10878 if (Glue.getNode()) {
10879 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32, Glue);
10880 Chain = Glue.getValue(1);
10881 } else
10882 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32);
10883 // Extract CC code.
10884 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
10885
10886 SDValue Result;
10887
10888 // Truncate or ZERO_EXTEND based on value types.
10889 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
10890 Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
10891 else
10892 Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
10893
10894 return Result;
10895}
10896
10897/// getConstraintType - Given a constraint letter, return the type of
10898/// constraint it is for this target.
10899 AArch64TargetLowering::ConstraintType
10900 AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
10901 if (Constraint.size() == 1) {
10902 switch (Constraint[0]) {
10903 default:
10904 break;
10905 case 'x':
10906 case 'w':
10907 case 'y':
10908 return C_RegisterClass;
10909 // An address with a single base register. Due to the way we
10910 // currently handle addresses it is the same as 'r'.
10911 case 'Q':
10912 return C_Memory;
10913 case 'I':
10914 case 'J':
10915 case 'K':
10916 case 'L':
10917 case 'M':
10918 case 'N':
10919 case 'Y':
10920 case 'Z':
10921 return C_Immediate;
10922 case 'z':
10923 case 'S': // A symbol or label reference with a constant offset
10924 return C_Other;
10925 }
10926 } else if (parsePredicateConstraint(Constraint))
10927 return C_RegisterClass;
10928 else if (parseReducedGprConstraint(Constraint))
10929 return C_RegisterClass;
10930 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
10931 return C_Other;
10932 return TargetLowering::getConstraintType(Constraint);
10933}
10934
10935/// Examine constraint type and operand type and determine a weight value.
10936/// This object must already have been set up with the operand type
10937/// and the current alternative constraint selected.
10938 TargetLowering::ConstraintWeight
10939 AArch64TargetLowering::getSingleConstraintMatchWeight(
10940 AsmOperandInfo &info, const char *constraint) const {
10941 ConstraintWeight weight = CW_Invalid;
10942 Value *CallOperandVal = info.CallOperandVal;
10943 // If we don't have a value, we can't do a match,
10944 // but allow it at the lowest weight.
10945 if (!CallOperandVal)
10946 return CW_Default;
10947 Type *type = CallOperandVal->getType();
10948 // Look at the constraint type.
10949 switch (*constraint) {
10950 default:
10951 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
10952 break;
10953 case 'x':
10954 case 'w':
10955 case 'y':
10956 if (type->isFloatingPointTy() || type->isVectorTy())
10957 weight = CW_Register;
10958 break;
10959 case 'z':
10960 weight = CW_Constant;
10961 break;
10962 case 'U':
10963 if (parsePredicateConstraint(constraint) ||
10964 parseReducedGprConstraint(constraint))
10965 weight = CW_Register;
10966 break;
10967 }
10968 return weight;
10969}
10970
10971std::pair<unsigned, const TargetRegisterClass *>
10972AArch64TargetLowering::getRegForInlineAsmConstraint(
10973 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
10974 if (Constraint.size() == 1) {
10975 switch (Constraint[0]) {
10976 case 'r':
10977 if (VT.isScalableVector())
10978 return std::make_pair(0U, nullptr);
10979 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
10980 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
10981 if (VT.getFixedSizeInBits() == 64)
10982 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
10983 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
10984 case 'w': {
10985 if (!Subtarget->hasFPARMv8())
10986 break;
10987 if (VT.isScalableVector()) {
10988 if (VT.getVectorElementType() != MVT::i1)
10989 return std::make_pair(0U, &AArch64::ZPRRegClass);
10990 return std::make_pair(0U, nullptr);
10991 }
10992 uint64_t VTSize = VT.getFixedSizeInBits();
10993 if (VTSize == 16)
10994 return std::make_pair(0U, &AArch64::FPR16RegClass);
10995 if (VTSize == 32)
10996 return std::make_pair(0U, &AArch64::FPR32RegClass);
10997 if (VTSize == 64)
10998 return std::make_pair(0U, &AArch64::FPR64RegClass);
10999 if (VTSize == 128)
11000 return std::make_pair(0U, &AArch64::FPR128RegClass);
11001 break;
11002 }
11003 // The instructions that this constraint is designed for can
11004 // only take 128-bit registers so just use that regclass.
11005 case 'x':
11006 if (!Subtarget->hasFPARMv8())
11007 break;
11008 if (VT.isScalableVector())
11009 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
11010 if (VT.getSizeInBits() == 128)
11011 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
11012 break;
11013 case 'y':
11014 if (!Subtarget->hasFPARMv8())
11015 break;
11016 if (VT.isScalableVector())
11017 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
11018 break;
11019 }
11020 } else {
11021 if (const auto PC = parsePredicateConstraint(Constraint))
11022 if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
11023 return std::make_pair(0U, RegClass);
11024
11025 if (const auto RGC = parseReducedGprConstraint(Constraint))
11026 if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
11027 return std::make_pair(0U, RegClass);
11028 }
11029 if (StringRef("{cc}").equals_insensitive(Constraint) ||
11030 parseConstraintCode(Constraint) != AArch64CC::Invalid)
11031 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
11032
11033 if (Constraint == "{za}") {
11034 return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
11035 }
11036
11037 if (Constraint == "{zt0}") {
11038 return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
11039 }
11040
11041 // Use the default implementation in TargetLowering to convert the register
11042 // constraint into a member of a register class.
11043 std::pair<unsigned, const TargetRegisterClass *> Res;
11044 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
11045
11046 // Not found as a standard register?
11047 if (!Res.second) {
11048 unsigned Size = Constraint.size();
11049 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
11050 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
11051 int RegNo;
11052 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
11053 if (!Failed && RegNo >= 0 && RegNo <= 31) {
11054 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
11055 // By default we'll emit v0-v31 for this unless there's a modifier where
11056 // we'll emit the correct register as well.
11057 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
11058 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
11059 Res.second = &AArch64::FPR64RegClass;
11060 } else {
11061 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
11062 Res.second = &AArch64::FPR128RegClass;
11063 }
11064 }
11065 }
11066 }
11067
11068 if (Res.second && !Subtarget->hasFPARMv8() &&
11069 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
11070 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
11071 return std::make_pair(0U, nullptr);
11072
11073 return Res;
11074}
11075
11076 EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
11077 llvm::Type *Ty,
11078 bool AllowUnknown) const {
11079 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
11080 return EVT(MVT::i64x8);
11081
11082 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
11083}
11084
11085/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
11086/// vector. If it is invalid, don't add anything to Ops.
11087void AArch64TargetLowering::LowerAsmOperandForConstraint(
11088 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
11089 SelectionDAG &DAG) const {
11090 SDValue Result;
11091
11092 // Currently only support length 1 constraints.
11093 if (Constraint.size() != 1)
11094 return;
11095
11096 char ConstraintLetter = Constraint[0];
11097 switch (ConstraintLetter) {
11098 default:
11099 break;
11100
11101 // This set of constraints deal with valid constants for various instructions.
11102 // Validate and return a target constant for them if we can.
11103 case 'z': {
11104 // 'z' maps to xzr or wzr so it needs an input of 0.
11105 if (!isNullConstant(Op))
11106 return;
11107
11108 if (Op.getValueType() == MVT::i64)
11109 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
11110 else
11111 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
11112 break;
11113 }
11114 case 'S':
11115 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
11116 // supported for PIC while "s" isn't, making "s" less useful. We implement
11117 // "S" but not "s".
11118 TargetLowering::LowerAsmOperandForConstraint(Op, "s", Ops, DAG);
11119 break;
11120
11121 case 'I':
11122 case 'J':
11123 case 'K':
11124 case 'L':
11125 case 'M':
11126 case 'N':
11127 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
11128 if (!C)
11129 return;
11130
11131 // Grab the value and do some validation.
11132 uint64_t CVal = C->getZExtValue();
11133 switch (ConstraintLetter) {
11134 // The I constraint applies only to simple ADD or SUB immediate operands:
11135 // i.e. 0 to 4095 with optional shift by 12
11136 // The J constraint applies only to ADD or SUB immediates that would be
11137 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
11138 // instruction [or vice versa], in other words -1 to -4095 with optional
11139 // left shift by 12.
11140 case 'I':
11141 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
11142 break;
11143 return;
11144 case 'J': {
11145 uint64_t NVal = -C->getSExtValue();
11146 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
11147 CVal = C->getSExtValue();
11148 break;
11149 }
11150 return;
11151 }
11152 // The K and L constraints apply *only* to logical immediates, including
11153 // what used to be the MOVI alias for ORR (though the MOVI alias has now
11154 // been removed and MOV should be used). So these constraints have to
11155 // distinguish between bit patterns that are valid 32-bit or 64-bit
11156 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
11157 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
11158 // versa.
11159 case 'K':
11160 if (AArch64_AM::isLogicalImmediate(CVal, 32))
11161 break;
11162 return;
11163 case 'L':
11164 if (AArch64_AM::isLogicalImmediate(CVal, 64))
11165 break;
11166 return;
11167 // The M and N constraints are a superset of K and L respectively, for use
11168 // with the MOV (immediate) alias. As well as the logical immediates they
11169 // also match 32 or 64-bit immediates that can be loaded either using a
11170 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
11171 // (M) or 64-bit 0x1234000000000000 (N) etc.
11172 // As a note some of this code is liberally stolen from the asm parser.
11173 case 'M': {
11174 if (!isUInt<32>(CVal))
11175 return;
11176 if (AArch64_AM::isLogicalImmediate(CVal, 32))
11177 break;
11178 if ((CVal & 0xFFFF) == CVal)
11179 break;
11180 if ((CVal & 0xFFFF0000ULL) == CVal)
11181 break;
11182 uint64_t NCVal = ~(uint32_t)CVal;
11183 if ((NCVal & 0xFFFFULL) == NCVal)
11184 break;
11185 if ((NCVal & 0xFFFF0000ULL) == NCVal)
11186 break;
11187 return;
11188 }
11189 case 'N': {
11190 if (AArch64_AM::isLogicalImmediate(CVal, 64))
11191 break;
11192 if ((CVal & 0xFFFFULL) == CVal)
11193 break;
11194 if ((CVal & 0xFFFF0000ULL) == CVal)
11195 break;
11196 if ((CVal & 0xFFFF00000000ULL) == CVal)
11197 break;
11198 if ((CVal & 0xFFFF000000000000ULL) == CVal)
11199 break;
11200 uint64_t NCVal = ~CVal;
11201 if ((NCVal & 0xFFFFULL) == NCVal)
11202 break;
11203 if ((NCVal & 0xFFFF0000ULL) == NCVal)
11204 break;
11205 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
11206 break;
11207 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
11208 break;
11209 return;
11210 }
11211 default:
11212 return;
11213 }
11214
11215 // All assembler immediates are 64-bit integers.
11216 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
11217 break;
11218 }
11219
11220 if (Result.getNode()) {
11221 Ops.push_back(Result);
11222 return;
11223 }
11224
11225 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
11226}
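// Worked examples for the immediate constraints validated above (values are
// illustrative only):
//   'I': 4095 and 0x7ff000 (0x7ff << 12) are accepted; 4097 is rejected since
//        it is neither a 12-bit value nor a 12-bit value shifted left by 12.
//   'K': 0xaaaaaaaa is a valid 32-bit bitmask immediate; 0 and ~0 never are.
//   'N': 0x1234000000000000 is accepted because a single MOVZ with a shift of
//        48 can produce it.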
11227
11228//===----------------------------------------------------------------------===//
11229// AArch64 Advanced SIMD Support
11230//===----------------------------------------------------------------------===//
11231
11232/// WidenVector - Given a value in the V64 register class, produce the
11233/// equivalent value in the V128 register class.
11234 static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
11235 EVT VT = V64Reg.getValueType();
11236 unsigned NarrowSize = VT.getVectorNumElements();
11237 MVT EltTy = VT.getVectorElementType().getSimpleVT();
11238 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
11239 SDLoc DL(V64Reg);
11240
11241 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
11242 V64Reg, DAG.getConstant(0, DL, MVT::i64));
11243}
11244
11245/// getExtFactor - Determine the adjustment factor for the position when
11246/// generating an "extract from vector registers" instruction.
11247static unsigned getExtFactor(SDValue &V) {
11248 EVT EltType = V.getValueType().getVectorElementType();
11249 return EltType.getSizeInBits() / 8;
11250}
11251
11252// Check if a vector is built from one vector via extracted elements of
11253// another together with an AND mask, ensuring that all elements fit
11254// within range. This can be reconstructed using AND and NEON's TBL1.
11255 SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) {
11256 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
11257 SDLoc dl(Op);
11258 EVT VT = Op.getValueType();
11259 assert(!VT.isScalableVector() &&
11260 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
11261
11262 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
11263 // directly to TBL1.
11264 if (VT != MVT::v16i8 && VT != MVT::v8i8)
11265 return SDValue();
11266
11267 unsigned NumElts = VT.getVectorNumElements();
11268 assert((NumElts == 8 || NumElts == 16) &&
11269 "Need to have exactly 8 or 16 elements in vector.");
11270
11271 SDValue SourceVec;
11272 SDValue MaskSourceVec;
11273 SmallVector<SDValue, 16> AndMaskConstants;
11274
11275 for (unsigned i = 0; i < NumElts; ++i) {
11276 SDValue V = Op.getOperand(i);
11277 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11278 return SDValue();
11279
11280 SDValue OperandSourceVec = V.getOperand(0);
11281 if (!SourceVec)
11282 SourceVec = OperandSourceVec;
11283 else if (SourceVec != OperandSourceVec)
11284 return SDValue();
11285
11286 // This only looks at shuffles with elements that are
11287 // a) truncated by a constant AND mask extracted from a mask vector, or
11288 // b) extracted directly from a mask vector.
11289 SDValue MaskSource = V.getOperand(1);
11290 if (MaskSource.getOpcode() == ISD::AND) {
11291 if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
11292 return SDValue();
11293
11294 AndMaskConstants.push_back(MaskSource.getOperand(1));
11295 MaskSource = MaskSource->getOperand(0);
11296 } else if (!AndMaskConstants.empty()) {
11297 // Either all or no operands should have an AND mask.
11298 return SDValue();
11299 }
11300
11301 // An ANY_EXTEND may be inserted between the AND and the source vector
11302 // extraction. We don't care about that, so we can just skip it.
11303 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
11304 MaskSource = MaskSource.getOperand(0);
11305
11306 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11307 return SDValue();
11308
11309 SDValue MaskIdx = MaskSource.getOperand(1);
11310 if (!isa<ConstantSDNode>(MaskIdx) ||
11311 !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
11312 return SDValue();
11313
11314 // We only apply this if all elements come from the same vector with the
11315 // same vector type.
11316 if (!MaskSourceVec) {
11317 MaskSourceVec = MaskSource->getOperand(0);
11318 if (MaskSourceVec.getValueType() != VT)
11319 return SDValue();
11320 } else if (MaskSourceVec != MaskSource->getOperand(0)) {
11321 return SDValue();
11322 }
11323 }
11324
11325 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
11326 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
11327 // insert, we know that the index in the mask must be smaller than the number
11328 // of elements in the source, or we would have an out-of-bounds access.
11329 if (NumElts == 8)
11330 SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, SourceVec,
11331 DAG.getUNDEF(VT));
11332
11333 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
11334 if (!AndMaskConstants.empty())
11335 MaskSourceVec = DAG.getNode(ISD::AND, dl, VT, MaskSourceVec,
11336 DAG.getBuildVector(VT, dl, AndMaskConstants));
11337
11338 return DAG.getNode(
11339 ISD::INTRINSIC_WO_CHAIN, dl, VT,
11340 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, dl, MVT::i32), SourceVec,
11341 MaskSourceVec);
11342}
11343
11344// Gather data to see if the operation can be modelled as a
11345// shuffle in combination with VEXTs.
11346 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
11347 SelectionDAG &DAG) const {
11348 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
11349 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
11350 SDLoc dl(Op);
11351 EVT VT = Op.getValueType();
11352 assert(!VT.isScalableVector() &&
11353 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
11354 unsigned NumElts = VT.getVectorNumElements();
11355
11356 struct ShuffleSourceInfo {
11357 SDValue Vec;
11358 unsigned MinElt;
11359 unsigned MaxElt;
11360
11361 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
11362 // be compatible with the shuffle we intend to construct. As a result
11363 // ShuffleVec will be some sliding window into the original Vec.
11364 SDValue ShuffleVec;
11365
11366 // Code should guarantee that element i in Vec starts at element "WindowBase
11367 // + i * WindowScale in ShuffleVec".
11368 int WindowBase;
11369 int WindowScale;
11370
11371 ShuffleSourceInfo(SDValue Vec)
11372 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
11373 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
11374
11375 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
11376 };
11377
11378 // First gather all vectors used as an immediate source for this BUILD_VECTOR
11379 // node.
11380 SmallVector<ShuffleSourceInfo, 2> Sources;
11381 for (unsigned i = 0; i < NumElts; ++i) {
11382 SDValue V = Op.getOperand(i);
11383 if (V.isUndef())
11384 continue;
11385 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11386 !isa<ConstantSDNode>(V.getOperand(1)) ||
11387 V.getOperand(0).getValueType().isScalableVector()) {
11388 LLVM_DEBUG(
11389 dbgs() << "Reshuffle failed: "
11390 "a shuffle can only come from building a vector from "
11391 "various elements of other fixed-width vectors, provided "
11392 "their indices are constant\n");
11393 return SDValue();
11394 }
11395
11396 // Add this element source to the list if it's not already there.
11397 SDValue SourceVec = V.getOperand(0);
11398 auto Source = find(Sources, SourceVec);
11399 if (Source == Sources.end())
11400 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
11401
11402 // Update the minimum and maximum lane number seen.
11403 unsigned EltNo = V.getConstantOperandVal(1);
11404 Source->MinElt = std::min(Source->MinElt, EltNo);
11405 Source->MaxElt = std::max(Source->MaxElt, EltNo);
11406 }
11407
11408 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
11409 // better than moving to/from gpr registers for larger vectors.
11410 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
11411 // Construct a mask for the tbl. We may need to adjust the index for types
11412 // larger than i8.
11413 SmallVector<int, 16> Mask;
11414 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
11415 for (unsigned I = 0; I < NumElts; ++I) {
11416 SDValue V = Op.getOperand(I);
11417 if (V.isUndef()) {
11418 for (unsigned OF = 0; OF < OutputFactor; OF++)
11419 Mask.push_back(-1);
11420 continue;
11421 }
11422 // Set the Mask lanes adjusted for the size of the input and output
11423 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
11424 // output element, adjusted in their positions per input and output types.
11425 unsigned Lane = V.getConstantOperandVal(1);
11426 for (unsigned S = 0; S < Sources.size(); S++) {
11427 if (V.getOperand(0) == Sources[S].Vec) {
11428 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
11429 unsigned InputBase = 16 * S + Lane * InputSize / 8;
11430 for (unsigned OF = 0; OF < OutputFactor; OF++)
11431 Mask.push_back(InputBase + OF);
11432 break;
11433 }
11434 }
11435 }
11436
11437 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
11438 // v16i8, and the TBLMask
11439 SmallVector<SDValue, 16> TBLOperands;
11440 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
11441 ? Intrinsic::aarch64_neon_tbl3
11442 : Intrinsic::aarch64_neon_tbl4,
11443 dl, MVT::i32));
11444 for (unsigned i = 0; i < Sources.size(); i++) {
11445 SDValue Src = Sources[i].Vec;
11446 EVT SrcVT = Src.getValueType();
11447 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
11448 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
11449 "Expected a legally typed vector");
11450 if (SrcVT.is64BitVector())
11451 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src,
11452 DAG.getUNDEF(MVT::v8i8));
11453 TBLOperands.push_back(Src);
11454 }
11455
11456 SmallVector<SDValue, 16> TBLMask;
11457 for (unsigned i = 0; i < Mask.size(); i++)
11458 TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32));
11459 assert((Mask.size() == 8 || Mask.size() == 16) &&
11460 "Expected a v8i8 or v16i8 Mask");
11461 TBLOperands.push_back(
11462 DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));
11463
11464 SDValue Shuffle =
11465 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
11466 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
11467 return DAG.getBitcast(VT, Shuffle);
11468 }
11469
11470 if (Sources.size() > 2) {
11471 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
11472 << "sensible when at most two source vectors are "
11473 << "involved\n");
11474 return SDValue();
11475 }
11476
11477 // Find out the smallest element size among result and two sources, and use
11478 // it as element size to build the shuffle_vector.
11479 EVT SmallestEltTy = VT.getVectorElementType();
11480 for (auto &Source : Sources) {
11481 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
11482 if (SrcEltTy.bitsLT(SmallestEltTy)) {
11483 SmallestEltTy = SrcEltTy;
11484 }
11485 }
11486 unsigned ResMultiplier =
11487 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
11488 uint64_t VTSize = VT.getFixedSizeInBits();
11489 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
11490 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
11491
11492 // If the source vector is too wide or too narrow, we may nevertheless be able
11493 // to construct a compatible shuffle either by concatenating it with UNDEF or
11494 // extracting a suitable range of elements.
11495 for (auto &Src : Sources) {
11496 EVT SrcVT = Src.ShuffleVec.getValueType();
11497
11498 TypeSize SrcVTSize = SrcVT.getSizeInBits();
11499 if (SrcVTSize == TypeSize::getFixed(VTSize))
11500 continue;
11501
11502 // This stage of the search produces a source with the same element type as
11503 // the original, but with a total width matching the BUILD_VECTOR output.
11504 EVT EltVT = SrcVT.getVectorElementType();
11505 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
11506 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
11507
11508 if (SrcVTSize.getFixedValue() < VTSize) {
11509 assert(2 * SrcVTSize == VTSize);
11510 // We can pad out the smaller vector for free, so if it's part of a
11511 // shuffle...
11512 Src.ShuffleVec =
11513 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
11514 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
11515 continue;
11516 }
11517
11518 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
11519 LLVM_DEBUG(
11520 dbgs() << "Reshuffle failed: result vector too small to extract\n");
11521 return SDValue();
11522 }
11523
11524 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
11525 LLVM_DEBUG(
11526 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
11527 return SDValue();
11528 }
11529
11530 if (Src.MinElt >= NumSrcElts) {
11531 // The extraction can just take the second half
11532 Src.ShuffleVec =
11533 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11534 DAG.getConstant(NumSrcElts, dl, MVT::i64));
11535 Src.WindowBase = -NumSrcElts;
11536 } else if (Src.MaxElt < NumSrcElts) {
11537 // The extraction can just take the first half
11538 Src.ShuffleVec =
11539 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11540 DAG.getConstant(0, dl, MVT::i64));
11541 } else {
11542 // An actual VEXT is needed
11543 SDValue VEXTSrc1 =
11544 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11545 DAG.getConstant(0, dl, MVT::i64));
11546 SDValue VEXTSrc2 =
11547 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11548 DAG.getConstant(NumSrcElts, dl, MVT::i64));
11549 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
11550
11551 if (!SrcVT.is64BitVector()) {
11552 LLVM_DEBUG(
11553 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
11554 "for SVE vectors.");
11555 return SDValue();
11556 }
11557
11558 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
11559 VEXTSrc2,
11560 DAG.getConstant(Imm, dl, MVT::i32));
11561 Src.WindowBase = -Src.MinElt;
11562 }
11563 }
11564
11565 // Another possible incompatibility occurs from the vector element types. We
11566 // can fix this by bitcasting the source vectors to the same type we intend
11567 // for the shuffle.
11568 for (auto &Src : Sources) {
11569 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
11570 if (SrcEltTy == SmallestEltTy)
11571 continue;
11572 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
11573 if (DAG.getDataLayout().isBigEndian()) {
11574 Src.ShuffleVec =
11575 DAG.getNode(AArch64ISD::NVCAST, dl, ShuffleVT, Src.ShuffleVec);
11576 } else {
11577 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
11578 }
11579 Src.WindowScale =
11580 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
11581 Src.WindowBase *= Src.WindowScale;
11582 }
11583
11584 // Final check before we try to actually produce a shuffle.
11585 LLVM_DEBUG(for (auto Src
11586 : Sources)
11587 assert(Src.ShuffleVec.getValueType() == ShuffleVT););
11588
11589 // The stars all align, our next step is to produce the mask for the shuffle.
11590 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
11591 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
11592 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
11593 SDValue Entry = Op.getOperand(i);
11594 if (Entry.isUndef())
11595 continue;
11596
11597 auto Src = find(Sources, Entry.getOperand(0));
11598 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
11599
11600 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
11601 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
11602 // segment.
11603 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
11604 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
11605 VT.getScalarSizeInBits());
11606 int LanesDefined = BitsDefined / BitsPerShuffleLane;
11607
11608 // This source is expected to fill ResMultiplier lanes of the final shuffle,
11609 // starting at the appropriate offset.
11610 int *LaneMask = &Mask[i * ResMultiplier];
11611
11612 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
11613 ExtractBase += NumElts * (Src - Sources.begin());
11614 for (int j = 0; j < LanesDefined; ++j)
11615 LaneMask[j] = ExtractBase + j;
11616 }
11617
11618 // Final check before we try to produce nonsense...
11619 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
11620 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
11621 return SDValue();
11622 }
11623
11624 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
11625 for (unsigned i = 0; i < Sources.size(); ++i)
11626 ShuffleOps[i] = Sources[i].ShuffleVec;
11627
11628 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
11629 ShuffleOps[1], Mask);
11630 SDValue V;
11631 if (DAG.getDataLayout().isBigEndian()) {
11632 V = DAG.getNode(AArch64ISD::NVCAST, dl, VT, Shuffle);
11633 } else {
11634 V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
11635 }
11636
11637 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
11638 dbgs() << "Reshuffle, creating node: "; V.dump(););
11639
11640 return V;
11641}
11642
11643 // Check if an EXT instruction can handle the shuffle mask when the
11644// vector sources of the shuffle are the same.
11645static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
11646 unsigned NumElts = VT.getVectorNumElements();
11647
11648 // Assume that the first shuffle index is not UNDEF. Fail if it is.
11649 if (M[0] < 0)
11650 return false;
11651
11652 Imm = M[0];
11653
11654 // If this is a VEXT shuffle, the immediate value is the index of the first
11655 // element. The other shuffle indices must be the successive elements after
11656 // the first one.
11657 unsigned ExpectedElt = Imm;
11658 for (unsigned i = 1; i < NumElts; ++i) {
11659 // Increment the expected index. If it wraps around, just follow it
11660 // back to index zero and keep going.
11661 ++ExpectedElt;
11662 if (ExpectedElt == NumElts)
11663 ExpectedElt = 0;
11664
11665 if (M[i] < 0)
11666 continue; // ignore UNDEF indices
11667 if (ExpectedElt != static_cast<unsigned>(M[i]))
11668 return false;
11669 }
11670
11671 return true;
11672}
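// Example accepted by isSingletonEXTMask: for v8i8 and M = <3,4,5,6,7,0,1,2>
// the indices are consecutive modulo 8 starting at 3, so Imm = 3 and the
// shuffle can be emitted as "ext v0.8b, v1.8b, v1.8b, #3" with both EXT
// sources being the same register.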
11673
11674// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
11675// v4i32s. This is really a truncate, which we can construct out of (legal)
11676// concats and truncate nodes.
11677 static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
11678 if (V.getValueType() != MVT::v16i8)
11679 return SDValue();
11680 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
11681
11682 for (unsigned X = 0; X < 4; X++) {
11683 // Check the first item in each group is an extract from lane 0 of a v4i32
11684 // or v4i16.
11685 SDValue BaseExt = V.getOperand(X * 4);
11686 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11687 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
11688 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
11689 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
11690 BaseExt.getConstantOperandVal(1) != 0)
11691 return SDValue();
11692 SDValue Base = BaseExt.getOperand(0);
11693 // And check the other items are extracts from the same vector.
11694 for (unsigned Y = 1; Y < 4; Y++) {
11695 SDValue Ext = V.getOperand(X * 4 + Y);
11696 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11697 Ext.getOperand(0) != Base ||
11698 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
11699 Ext.getConstantOperandVal(1) != Y)
11700 return SDValue();
11701 }
11702 }
11703
11704 // Turn the buildvector into a series of truncates and concats, which will
11705 // become uzp1s. Any v4i32s we found get truncated to v4i16, which are
11706 // concatenated together to produce 2 v8i16s. These are both truncated and
11707 // concatenated together.
11708 SDLoc DL(V);
11709 SDValue Trunc[4] = {
11710 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
11711 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
11712 for (SDValue &V : Trunc)
11713 if (V.getValueType() == MVT::v4i32)
11714 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
11715 SDValue Concat0 =
11716 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
11717 SDValue Concat1 =
11718 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
11719 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
11720 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
11721 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
11722}
11723
11724 /// Check if a vector shuffle corresponds to a DUP instruction with a larger
11725/// element width than the vector lane type. If that is the case the function
11726/// returns true and writes the value of the DUP instruction lane operand into
11727/// DupLaneOp
11728static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
11729 unsigned &DupLaneOp) {
11730 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
11731 "Only possible block sizes for wide DUP are: 16, 32, 64");
11732
11733 if (BlockSize <= VT.getScalarSizeInBits())
11734 return false;
11735 if (BlockSize % VT.getScalarSizeInBits() != 0)
11736 return false;
11737 if (VT.getSizeInBits() % BlockSize != 0)
11738 return false;
11739
11740 size_t SingleVecNumElements = VT.getVectorNumElements();
11741 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
11742 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
11743
11744 // We are looking for masks like
11745 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
11746 // might be replaced by 'undefined'. BlockElts will eventually contain
11747 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
11748 // for the above examples)
11749 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
11750 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
11751 for (size_t I = 0; I < NumEltsPerBlock; I++) {
11752 int Elt = M[BlockIndex * NumEltsPerBlock + I];
11753 if (Elt < 0)
11754 continue;
11755 // For now we don't support shuffles that use the second operand
11756 if ((unsigned)Elt >= SingleVecNumElements)
11757 return false;
11758 if (BlockElts[I] < 0)
11759 BlockElts[I] = Elt;
11760 else if (BlockElts[I] != Elt)
11761 return false;
11762 }
11763
11764 // We found a candidate block (possibly with some undefs). It must be a
11765 // sequence of consecutive integers starting with a value divisible by
11766 // NumEltsPerBlock with some values possibly replaced by undef-s.
11767
11768 // Find first non-undef element
11769 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
11770 assert(FirstRealEltIter != BlockElts.end() &&
11771 "Shuffle with all-undefs must have been caught by previous cases, "
11772 "e.g. isSplat()");
11773 if (FirstRealEltIter == BlockElts.end()) {
11774 DupLaneOp = 0;
11775 return true;
11776 }
11777
11778 // Index of FirstRealElt in BlockElts
11779 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
11780
11781 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
11782 return false;
11783 // BlockElts[0] must have the following value if it isn't undef:
11784 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
11785
11786 // Check the first element
11787 if (Elt0 % NumEltsPerBlock != 0)
11788 return false;
11789 // Check that the sequence indeed consists of consecutive integers (modulo
11790 // undefs)
11791 for (size_t I = 0; I < NumEltsPerBlock; I++)
11792 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
11793 return false;
11794
11795 DupLaneOp = Elt0 / NumEltsPerBlock;
11796 return true;
11797}
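// Example accepted by isWideDUPMask: for v8i16 with BlockSize = 64 the mask
// <4,5,6,7,4,5,6,7> repeats the block starting at element 4, so
// NumEltsPerBlock = 4 and DupLaneOp = 1, i.e. a DUP of 64-bit lane 1
// (roughly "dup v0.2d, v1.d[1]").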
11798
11799 // Check if an EXT instruction can handle the shuffle mask when the
11800// vector sources of the shuffle are different.
11801static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
11802 unsigned &Imm) {
11803 // Look for the first non-undef element.
11804 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
11805
11806 // Benefit from APInt to handle overflow when calculating the expected element.
11807 unsigned NumElts = VT.getVectorNumElements();
11808 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
11809 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
11810 // The following shuffle indices must be the successive elements after the
11811 // first real element.
11812 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
11813 return Elt != ExpectedElt++ && Elt != -1;
11814 });
11815 if (FoundWrongElt)
11816 return false;
11817
11818 // The index of an EXT is the first element if it is not UNDEF.
11819 // Watch out for the beginning UNDEFs. The EXT index should be the expected
11820 // value of the first element. E.g.
11821 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
11822 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
11823 // ExpectedElt is the last mask index plus 1.
11824 Imm = ExpectedElt.getZExtValue();
11825
11826 // There are two different cases that require reversing the input vectors.
11827 // For example, for vector <4 x i32> we have the following cases,
11828 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
11829 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
11830 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
11831 // to reverse two input vectors.
11832 if (Imm < NumElts)
11833 ReverseEXT = true;
11834 else
11835 Imm -= NumElts;
11836
11837 return true;
11838}
11839
11840/// isREVMask - Check if a vector shuffle corresponds to a REV
11841/// instruction with the specified blocksize. (The order of the elements
11842/// within each block of the vector is reversed.)
11843static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
11844 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64 ||
11845 BlockSize == 128) &&
11846 "Only possible block sizes for REV are: 16, 32, 64, 128");
11847
11848 unsigned EltSz = VT.getScalarSizeInBits();
11849 unsigned NumElts = VT.getVectorNumElements();
11850 unsigned BlockElts = M[0] + 1;
11851 // If the first shuffle index is UNDEF, be optimistic.
11852 if (M[0] < 0)
11853 BlockElts = BlockSize / EltSz;
11854
11855 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
11856 return false;
11857
11858 for (unsigned i = 0; i < NumElts; ++i) {
11859 if (M[i] < 0)
11860 continue; // ignore UNDEF indices
11861 if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
11862 return false;
11863 }
11864
11865 return true;
11866}
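// Examples accepted by isREVMask: with 32-bit elements and BlockSize = 64,
// <1,0,3,2> on v4i32 reverses each 64-bit block (REV64); with 16-bit elements
// and BlockSize = 32, <1,0,3,2,5,4,7,6> on v8i16 matches REV32.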
11867
11868static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11869 unsigned NumElts = VT.getVectorNumElements();
11870 if (NumElts % 2 != 0)
11871 return false;
11872 WhichResult = (M[0] == 0 ? 0 : 1);
11873 unsigned Idx = WhichResult * NumElts / 2;
11874 for (unsigned i = 0; i != NumElts; i += 2) {
11875 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
11876 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
11877 return false;
11878 Idx += 1;
11879 }
11880
11881 return true;
11882}
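// Example: for v4i32, <0,4,1,5> interleaves the low halves of the two inputs
// (WhichResult = 0, ZIP1) and <2,6,3,7> interleaves the high halves
// (WhichResult = 1, ZIP2).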
11883
11884static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11885 unsigned NumElts = VT.getVectorNumElements();
11886 WhichResult = (M[0] == 0 ? 0 : 1);
11887 for (unsigned i = 0; i != NumElts; ++i) {
11888 if (M[i] < 0)
11889 continue; // ignore UNDEF indices
11890 if ((unsigned)M[i] != 2 * i + WhichResult)
11891 return false;
11892 }
11893
11894 return true;
11895}
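// Example: for v4i32, <0,2,4,6> keeps the even-indexed elements of the
// concatenated inputs (WhichResult = 0, UZP1) and <1,3,5,7> keeps the odd
// ones (WhichResult = 1, UZP2).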
11896
11897static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11898 unsigned NumElts = VT.getVectorNumElements();
11899 if (NumElts % 2 != 0)
11900 return false;
11901 WhichResult = (M[0] == 0 ? 0 : 1);
11902 for (unsigned i = 0; i < NumElts; i += 2) {
11903 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
11904 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
11905 return false;
11906 }
11907 return true;
11908}
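// Example: for v4i32, <0,4,2,6> interleaves the even lanes of the two inputs
// (WhichResult = 0, TRN1) and <1,5,3,7> interleaves the odd lanes
// (WhichResult = 1, TRN2).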
11909
11910/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
11911/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11912/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
11913static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11914 unsigned NumElts = VT.getVectorNumElements();
11915 if (NumElts % 2 != 0)
11916 return false;
11917 WhichResult = (M[0] == 0 ? 0 : 1);
11918 unsigned Idx = WhichResult * NumElts / 2;
11919 for (unsigned i = 0; i != NumElts; i += 2) {
11920 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
11921 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
11922 return false;
11923 Idx += 1;
11924 }
11925
11926 return true;
11927}
11928
11929/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
11930/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11931 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
11932static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11933 unsigned Half = VT.getVectorNumElements() / 2;
11934 WhichResult = (M[0] == 0 ? 0 : 1);
11935 for (unsigned j = 0; j != 2; ++j) {
11936 unsigned Idx = WhichResult;
11937 for (unsigned i = 0; i != Half; ++i) {
11938 int MIdx = M[i + j * Half];
11939 if (MIdx >= 0 && (unsigned)MIdx != Idx)
11940 return false;
11941 Idx += 2;
11942 }
11943 }
11944
11945 return true;
11946}
11947
11948/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
11949/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11950/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
11951static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11952 unsigned NumElts = VT.getVectorNumElements();
11953 if (NumElts % 2 != 0)
11954 return false;
11955 WhichResult = (M[0] == 0 ? 0 : 1);
11956 for (unsigned i = 0; i < NumElts; i += 2) {
11957 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
11958 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
11959 return false;
11960 }
11961 return true;
11962}
11963
11964static bool isINSMask(ArrayRef<int> M, int NumInputElements,
11965 bool &DstIsLeft, int &Anomaly) {
11966 if (M.size() != static_cast<size_t>(NumInputElements))
11967 return false;
11968
11969 int NumLHSMatch = 0, NumRHSMatch = 0;
11970 int LastLHSMismatch = -1, LastRHSMismatch = -1;
11971
11972 for (int i = 0; i < NumInputElements; ++i) {
11973 if (M[i] == -1) {
11974 ++NumLHSMatch;
11975 ++NumRHSMatch;
11976 continue;
11977 }
11978
11979 if (M[i] == i)
11980 ++NumLHSMatch;
11981 else
11982 LastLHSMismatch = i;
11983
11984 if (M[i] == i + NumInputElements)
11985 ++NumRHSMatch;
11986 else
11987 LastRHSMismatch = i;
11988 }
11989
11990 if (NumLHSMatch == NumInputElements - 1) {
11991 DstIsLeft = true;
11992 Anomaly = LastLHSMismatch;
11993 return true;
11994 } else if (NumRHSMatch == NumInputElements - 1) {
11995 DstIsLeft = false;
11996 Anomaly = LastRHSMismatch;
11997 return true;
11998 }
11999
12000 return false;
12001}
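// Example: for two v4i32 inputs, M = <0,1,6,3> matches the left-hand identity
// everywhere except lane 2, so DstIsLeft = true and Anomaly = 2; the shuffle
// can then be lowered as an INS of element 2 of the right operand into lane 2
// of the left operand.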
12002
12003static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
12004 if (VT.getSizeInBits() != 128)
12005 return false;
12006
12007 unsigned NumElts = VT.getVectorNumElements();
12008
12009 for (int I = 0, E = NumElts / 2; I != E; I++) {
12010 if (Mask[I] != I)
12011 return false;
12012 }
12013
12014 int Offset = NumElts / 2;
12015 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
12016 if (Mask[I] != I + SplitLHS * Offset)
12017 return false;
12018 }
12019
12020 return true;
12021}
12022
12023 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
12024 SDLoc DL(Op);
12025 EVT VT = Op.getValueType();
12026 SDValue V0 = Op.getOperand(0);
12027 SDValue V1 = Op.getOperand(1);
12028 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
12029
12030 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
12031 VT.getVectorElementType() != V1.getValueType().getVectorElementType())
12032 return SDValue();
12033
12034 bool SplitV0 = V0.getValueSizeInBits() == 128;
12035
12036 if (!isConcatMask(Mask, VT, SplitV0))
12037 return SDValue();
12038
12039 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
12040 if (SplitV0) {
12041 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
12042 DAG.getConstant(0, DL, MVT::i64));
12043 }
12044 if (V1.getValueSizeInBits() == 128) {
12045 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
12046 DAG.getConstant(0, DL, MVT::i64));
12047 }
12048 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
12049}
12050
12051/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
12052/// the specified operations to build the shuffle. ID is the perfect-shuffle
12053/// ID, V1 and V2 are the original shuffle inputs. PFEntry is the perfect-shuffle
12054/// table entry and LHS/RHS are the immediate inputs for this stage of the
12055/// shuffle.
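/// Each PFEntry packs the operation into bits [29:26] and two 13-bit operand IDs
/// into bits [25:13] and [12:0]. Each ID is normally itself a perfect-shuffle
/// index: four base-9 digits, one per lane, with the digit 8 denoting an undef
/// lane (see the PFTableIndex computation in LowerVECTOR_SHUFFLE and getPFIDLane
/// below). For OP_MOVLANE the RHS field is instead the lane to move into, as
/// noted in the enum below.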
12056static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
12057 SDValue V2, unsigned PFEntry, SDValue LHS,
12058 SDValue RHS, SelectionDAG &DAG,
12059 const SDLoc &dl) {
12060 unsigned OpNum = (PFEntry >> 26) & 0x0F;
12061 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
12062 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
12063
12064 enum {
12065 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
12066 OP_VREV,
12067 OP_VDUP0,
12068 OP_VDUP1,
12069 OP_VDUP2,
12070 OP_VDUP3,
12071 OP_VEXT1,
12072 OP_VEXT2,
12073 OP_VEXT3,
12074 OP_VUZPL, // VUZP, left result
12075 OP_VUZPR, // VUZP, right result
12076 OP_VZIPL, // VZIP, left result
12077 OP_VZIPR, // VZIP, right result
12078 OP_VTRNL, // VTRN, left result
12079 OP_VTRNR, // VTRN, right result
12080 OP_MOVLANE // Move lane. RHSID is the lane to move into
12081 };
12082
12083 if (OpNum == OP_COPY) {
12084 if (LHSID == (1 * 9 + 2) * 9 + 3)
12085 return LHS;
12086 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
12087 return RHS;
12088 }
12089
12090 if (OpNum == OP_MOVLANE) {
12091 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
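    // (Lane 0 is the most-significant base-9 digit; e.g. the ID
    // ((1*9 + 2)*9 + 0)*9 + 8 decodes to the mask <1, 2, 0, undef>.)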
12092 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
12093 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
12094 Elt = 3 - Elt;
12095 while (Elt > 0) {
12096 ID /= 9;
12097 Elt--;
12098 }
12099 return (ID % 9 == 8) ? -1 : ID % 9;
12100 };
12101
12102 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
12103 // get the lane to move from the PFID, which is always from the
12104 // original vectors (V1 or V2).
12105 SDValue OpLHS = GeneratePerfectShuffle(
12106 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
12107 EVT VT = OpLHS.getValueType();
12108 assert(RHSID < 8 && "Expected a lane index for RHSID!");
12109 unsigned ExtLane = 0;
12110 SDValue Input;
12111
12112 // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
12113 // convert into a higher type.
12114 if (RHSID & 0x4) {
12115 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
12116 if (MaskElt == -1)
12117 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
12118 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
12119 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
12120 Input = MaskElt < 2 ? V1 : V2;
12121 if (VT.getScalarSizeInBits() == 16) {
12122 Input = DAG.getBitcast(MVT::v2f32, Input);
12123 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
12124 } else {
12125 assert(VT.getScalarSizeInBits() == 32 &&
12126 "Expected 16 or 32 bit shuffle elemements");
12127 Input = DAG.getBitcast(MVT::v2f64, Input);
12128 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
12129 }
12130 } else {
12131 int MaskElt = getPFIDLane(ID, RHSID);
12132 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
12133 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
12134 Input = MaskElt < 4 ? V1 : V2;
12135 // Be careful about creating illegal types. Use f16 instead of i16.
12136 if (VT == MVT::v4i16) {
12137 Input = DAG.getBitcast(MVT::v4f16, Input);
12138 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
12139 }
12140 }
12141 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
12142 Input.getValueType().getVectorElementType(),
12143 Input, DAG.getVectorIdxConstant(ExtLane, dl));
12144 SDValue Ins =
12145 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS,
12146 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl));
12147 return DAG.getBitcast(VT, Ins);
12148 }
12149
12150 SDValue OpLHS, OpRHS;
12151 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
12152 RHS, DAG, dl);
12153 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
12154 RHS, DAG, dl);
12155 EVT VT = OpLHS.getValueType();
12156
12157 switch (OpNum) {
12158 default:
12159 llvm_unreachable("Unknown shuffle opcode!");
12160 case OP_VREV:
12161 // VREV divides the vector in half and swaps within the half.
12162 if (VT.getVectorElementType() == MVT::i32 ||
12163 VT.getVectorElementType() == MVT::f32)
12164 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
12165 // vrev <4 x i16> -> REV32
12166 if (VT.getVectorElementType() == MVT::i16 ||
12167 VT.getVectorElementType() == MVT::f16 ||
12168 VT.getVectorElementType() == MVT::bf16)
12169 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
12170 // vrev <4 x i8> -> REV16
12171 assert(VT.getVectorElementType() == MVT::i8);
12172 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
12173 case OP_VDUP0:
12174 case OP_VDUP1:
12175 case OP_VDUP2:
12176 case OP_VDUP3: {
12177 EVT EltTy = VT.getVectorElementType();
12178 unsigned Opcode;
12179 if (EltTy == MVT::i8)
12180 Opcode = AArch64ISD::DUPLANE8;
12181 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
12182 Opcode = AArch64ISD::DUPLANE16;
12183 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
12184 Opcode = AArch64ISD::DUPLANE32;
12185 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
12186 Opcode = AArch64ISD::DUPLANE64;
12187 else
12188 llvm_unreachable("Invalid vector element type?");
12189
12190 if (VT.getSizeInBits() == 64)
12191 OpLHS = WidenVector(OpLHS, DAG);
12192 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
12193 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
12194 }
12195 case OP_VEXT1:
12196 case OP_VEXT2:
12197 case OP_VEXT3: {
12198 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
12199 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
12200 DAG.getConstant(Imm, dl, MVT::i32));
12201 }
12202 case OP_VUZPL:
12203 return DAG.getNode(AArch64ISD::UZP1, dl, VT, OpLHS, OpRHS);
12204 case OP_VUZPR:
12205 return DAG.getNode(AArch64ISD::UZP2, dl, VT, OpLHS, OpRHS);
12206 case OP_VZIPL:
12207 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, OpLHS, OpRHS);
12208 case OP_VZIPR:
12209 return DAG.getNode(AArch64ISD::ZIP2, dl, VT, OpLHS, OpRHS);
12210 case OP_VTRNL:
12211 return DAG.getNode(AArch64ISD::TRN1, dl, VT, OpLHS, OpRHS);
12212 case OP_VTRNR:
12213 return DAG.getNode(AArch64ISD::TRN2, dl, VT, OpLHS, OpRHS);
12214 }
12215}
12216
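/// GenerateTBL - Lower a shuffle through the NEON TBL instructions: a per-byte
/// index vector is built from ShuffleMask. When one source is undef or all
/// zeros, a single tbl1 suffices because TBL writes 0 for out-of-range indices;
/// otherwise both table registers are passed to tbl2.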
12217static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
12218 SelectionDAG &DAG) {
12219 // Check to see if we can use the TBL instruction.
12220 SDValue V1 = Op.getOperand(0);
12221 SDValue V2 = Op.getOperand(1);
12222 SDLoc DL(Op);
12223
12224 EVT EltVT = Op.getValueType().getVectorElementType();
12225 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
12226
12227 bool Swap = false;
12228 if (V1.isUndef() || isZerosVector(V1.getNode())) {
12229 std::swap(V1, V2);
12230 Swap = true;
12231 }
12232
12233 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
12234 // out of range values with 0s. We do need to make sure that any out-of-range
12235 // values are really out-of-range for a v16i8 vector.
12236 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
12237 MVT IndexVT = MVT::v8i8;
12238 unsigned IndexLen = 8;
12239 if (Op.getValueSizeInBits() == 128) {
12240 IndexVT = MVT::v16i8;
12241 IndexLen = 16;
12242 }
12243
12244 SmallVector<SDValue, 8> TBLMask;
12245 for (int Val : ShuffleMask) {
12246 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
12247 unsigned Offset = Byte + Val * BytesPerElt;
12248 if (Swap)
12249 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
12250 if (IsUndefOrZero && Offset >= IndexLen)
12251 Offset = 255;
12252 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
12253 }
12254 }
12255
12256 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
12257 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
12258
12259 SDValue Shuffle;
12260 if (IsUndefOrZero) {
12261 if (IndexLen == 8)
12262 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
12263 Shuffle = DAG.getNode(
12264 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12265 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
12266 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12267 } else {
12268 if (IndexLen == 8) {
12269 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
12270 Shuffle = DAG.getNode(
12271 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12272 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
12273 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12274 } else {
12275 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
12276 // cannot currently represent the register constraints on the input
12277 // table registers.
12278 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
12279 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
12280 // IndexLen));
12281 Shuffle = DAG.getNode(
12282 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12283 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
12284 V2Cst,
12285 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12286 }
12287 }
12288 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
12289}
12290
12291static unsigned getDUPLANEOp(EVT EltType) {
12292 if (EltType == MVT::i8)
12293 return AArch64ISD::DUPLANE8;
12294 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
12295 return AArch64ISD::DUPLANE16;
12296 if (EltType == MVT::i32 || EltType == MVT::f32)
12297 return AArch64ISD::DUPLANE32;
12298 if (EltType == MVT::i64 || EltType == MVT::f64)
12299 return AArch64ISD::DUPLANE64;
12300
12301 llvm_unreachable("Invalid vector element type?");
12302}
12303
12304static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
12305 unsigned Opcode, SelectionDAG &DAG) {
12306 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
12307 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
12308 // Match: dup (bitcast (extract_subv X, C)), LaneC
12309 if (BitCast.getOpcode() != ISD::BITCAST ||
12310 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
12311 return false;
12312
12313 // The extract index must align in the destination type. That may not
12314 // happen if the bitcast is from narrow to wide type.
12315 SDValue Extract = BitCast.getOperand(0);
12316 unsigned ExtIdx = Extract.getConstantOperandVal(1);
12317 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
12318 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
12319 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
12320 if (ExtIdxInBits % CastedEltBitWidth != 0)
12321 return false;
12322
12323 // Can't handle cases where vector size is not 128-bit
12324 if (!Extract.getOperand(0).getValueType().is128BitVector())
12325 return false;
12326
12327 // Update the lane value by offsetting with the scaled extract index.
12328 LaneC += ExtIdxInBits / CastedEltBitWidth;
12329
12330 // Determine the casted vector type of the wide vector input.
12331 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
12332 // Examples:
12333 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
12334 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
12335 unsigned SrcVecNumElts =
12336 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
12337 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
12338 SrcVecNumElts);
12339 return true;
12340 };
12341 MVT CastVT;
12342 if (getScaledOffsetDup(V, Lane, CastVT)) {
12343 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
12344 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12345 V.getOperand(0).getValueType().is128BitVector()) {
12346 // The lane is incremented by the index of the extract.
12347 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
12348 Lane += V.getConstantOperandVal(1);
12349 V = V.getOperand(0);
12350 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
12351 // The lane is decremented if we are splatting from the 2nd operand.
12352 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
12353 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
12354 Lane -= Idx * VT.getVectorNumElements() / 2;
12355 V = WidenVector(V.getOperand(Idx), DAG);
12356 } else if (VT.getSizeInBits() == 64) {
12357 // Widen the operand to 128-bit register with undef.
12358 V = WidenVector(V, DAG);
12359 }
12360 return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
12361}
12362
12363// Return true if we can get a new shuffle mask by checking the parameter mask
12364// array to test whether every two adjacent mask values are consecutive and
12365// start from an even number.
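// For example, <4, 5, 2, 3> widens to <2, 1>, and <0, 1, -1, -1> widens to
// <0, -1>.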
12366static bool isWideTypeMask(ArrayRef<int> M, EVT VT,
12367 SmallVectorImpl<int> &NewMask) {
12368 unsigned NumElts = VT.getVectorNumElements();
12369 if (NumElts % 2 != 0)
12370 return false;
12371
12372 NewMask.clear();
12373 for (unsigned i = 0; i < NumElts; i += 2) {
12374 int M0 = M[i];
12375 int M1 = M[i + 1];
12376
12377 // If both elements are undef, new mask is undef too.
12378 if (M0 == -1 && M1 == -1) {
12379 NewMask.push_back(-1);
12380 continue;
12381 }
12382
12383 if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
12384 NewMask.push_back(M1 / 2);
12385 continue;
12386 }
12387
12388 if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
12389 NewMask.push_back(M0 / 2);
12390 continue;
12391 }
12392
12393 NewMask.clear();
12394 return false;
12395 }
12396
12397 assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
12398 return true;
12399}
12400
12401// Try to widen element type to get a new mask value for a better permutation
12402// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
12403// UZP1/2, TRN1/2, REV, INS, etc.
12404// For example:
12405// shufflevector <4 x i32> %a, <4 x i32> %b,
12406// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
12407// is equivalent to:
12408// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
12409// Finally, we can get:
12410// mov v0.d[0], v1.d[1]
12411static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
12412 SDLoc DL(Op);
12413 EVT VT = Op.getValueType();
12414 EVT ScalarVT = VT.getVectorElementType();
12415 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
12416 SDValue V0 = Op.getOperand(0);
12417 SDValue V1 = Op.getOperand(1);
12418 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
12419
12420 // If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ...
12421 // We need to make sure the wider element type is legal. Thus, ElementSize
12422 // should not be larger than 32 bits, and the i1 type should also be excluded.
12423 if (ElementSize > 32 || ElementSize == 1)
12424 return SDValue();
12425
12426 SmallVector<int, 8> NewMask;
12427 if (isWideTypeMask(Mask, VT, NewMask)) {
12428 MVT NewEltVT = VT.isFloatingPoint()
12429 ? MVT::getFloatingPointVT(ElementSize * 2)
12430 : MVT::getIntegerVT(ElementSize * 2);
12431 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
12432 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
12433 V0 = DAG.getBitcast(NewVT, V0);
12434 V1 = DAG.getBitcast(NewVT, V1);
12435 return DAG.getBitcast(VT,
12436 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
12437 }
12438 }
12439
12440 return SDValue();
12441}
12442
12443// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
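// Both operands must be @llvm.aarch64.neon.tbl2 calls returning v16i8 with
// constant masks; the shuffle's bytes are then remapped onto a single
// @llvm.aarch64.neon.tbl4 over the four table registers. Constants taken from the
// second tbl2's mask are offset by 32, because its two 16-byte tables sit after
// the first tbl2's tables in the combined tbl4 table. Schematically:
//   %t1 = tbl2(%a, %b, <mask1>); %t2 = tbl2(%c, %d, <mask2>)
//   shuffle(%t1, %t2, <mask>)  ==>  tbl4(%a, %b, %c, %d, <remapped mask>)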
12444static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
12445 ArrayRef<int> ShuffleMask,
12446 SelectionDAG &DAG) {
12447 SDValue Tbl1 = Op->getOperand(0);
12448 SDValue Tbl2 = Op->getOperand(1);
12449 SDLoc dl(Op);
12450 SDValue Tbl2ID =
12451 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64);
12452
12453 EVT VT = Op.getValueType();
12454 if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
12455 Tbl1->getOperand(0) != Tbl2ID ||
12456 Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
12457 Tbl2->getOperand(0) != Tbl2ID)
12458 return SDValue();
12459
12460 if (Tbl1->getValueType(0) != MVT::v16i8 ||
12461 Tbl2->getValueType(0) != MVT::v16i8)
12462 return SDValue();
12463
12464 SDValue Mask1 = Tbl1->getOperand(3);
12465 SDValue Mask2 = Tbl2->getOperand(3);
12466 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
12467 for (unsigned I = 0; I < 16; I++) {
12468 if (ShuffleMask[I] < 16)
12469 TBLMaskParts[I] = Mask1->getOperand(ShuffleMask[I]);
12470 else {
12471 auto *C =
12472 dyn_cast<ConstantSDNode>(Mask2->getOperand(ShuffleMask[I] - 16));
12473 if (!C)
12474 return SDValue();
12475 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32);
12476 }
12477 }
12478
12479 SDValue TBLMask = DAG.getBuildVector(VT, dl, TBLMaskParts);
12480 SDValue ID =
12481 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64);
12482
12483 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
12484 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
12485 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
12486}
12487
12488// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
12489// but we don't have an appropriate instruction,
12490// so custom-lower it as ZIP1-with-zeros.
12491SDValue
12492AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
12493 SelectionDAG &DAG) const {
12494 SDLoc dl(Op);
12495 EVT VT = Op.getValueType();
12496 SDValue SrcOp = Op.getOperand(0);
12497 EVT SrcVT = SrcOp.getValueType();
12498 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
12499 "Unexpected extension factor.");
12500 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
12501 // FIXME: support multi-step zipping?
12502 if (Scale != 2)
12503 return SDValue();
12504 SDValue Zeros = DAG.getConstant(0, dl, SrcVT);
12505 return DAG.getBitcast(VT,
12506 DAG.getNode(AArch64ISD::ZIP1, dl, SrcVT, SrcOp, Zeros));
12507}
12508
12509SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
12510 SelectionDAG &DAG) const {
12511 SDLoc dl(Op);
12512 EVT VT = Op.getValueType();
12513
12514 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
12515
12516 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
12517 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
12518
12519 // Convert shuffles that are directly supported on NEON to target-specific
12520 // DAG nodes, instead of keeping them as shuffles and matching them again
12521 // during code selection. This is more efficient and avoids the possibility
12522 // of inconsistencies between legalization and selection.
12523 ArrayRef<int> ShuffleMask = SVN->getMask();
12524
12525 SDValue V1 = Op.getOperand(0);
12526 SDValue V2 = Op.getOperand(1);
12527
12528 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
12529 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
12530 "Unexpected VECTOR_SHUFFLE mask size!");
12531
12532 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
12533 return Res;
12534
12535 if (SVN->isSplat()) {
12536 int Lane = SVN->getSplatIndex();
12537 // If this is undef splat, generate it via "just" vdup, if possible.
12538 if (Lane == -1)
12539 Lane = 0;
12540
12541 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
12542 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
12543 V1.getOperand(0));
12544 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
12545 // constant. If so, we can just reference the lane's definition directly.
12546 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
12547 !isa<ConstantSDNode>(V1.getOperand(Lane)))
12548 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
12549
12550 // Otherwise, duplicate from the lane of the input vector.
12551 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
12552 return constructDup(V1, Lane, dl, VT, Opcode, DAG);
12553 }
12554
12555 // Check if the mask matches a DUP for a wider element
12556 for (unsigned LaneSize : {64U, 32U, 16U}) {
12557 unsigned Lane = 0;
12558 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
12559 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
12560 : LaneSize == 32 ? AArch64ISD::DUPLANE32
12561 : AArch64ISD::DUPLANE16;
12562 // Cast V1 to an integer vector with required lane size
12563 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
12564 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
12565 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
12566 V1 = DAG.getBitcast(NewVecTy, V1);
12567 // Construct the DUP instruction
12568 V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
12569 // Cast back to the original type
12570 return DAG.getBitcast(VT, V1);
12571 }
12572 }
12573
12574 if (isREVMask(ShuffleMask, VT, 64))
12575 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
12576 if (isREVMask(ShuffleMask, VT, 32))
12577 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
12578 if (isREVMask(ShuffleMask, VT, 16))
12579 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
12580
12581 if (((VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 16) ||
12582 (VT.getVectorNumElements() == 16 && VT.getScalarSizeInBits() == 8)) &&
12583 ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
12584 SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
12585 return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
12586 DAG.getConstant(8, dl, MVT::i32));
12587 }
12588
12589 bool ReverseEXT = false;
12590 unsigned Imm;
12591 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
12592 if (ReverseEXT)
12593 std::swap(V1, V2);
12594 Imm *= getExtFactor(V1);
12595 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
12596 DAG.getConstant(Imm, dl, MVT::i32));
12597 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
12598 Imm *= getExtFactor(V1);
12599 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
12600 DAG.getConstant(Imm, dl, MVT::i32));
12601 }
12602
12603 unsigned WhichResult;
12604 if (isZIPMask(ShuffleMask, VT, WhichResult)) {
12605 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
12606 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
12607 }
12608 if (isUZPMask(ShuffleMask, VT, WhichResult)) {
12609 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
12610 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
12611 }
12612 if (isTRNMask(ShuffleMask, VT, WhichResult)) {
12613 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
12614 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
12615 }
12616
12617 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
12618 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
12619 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
12620 }
12621 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
12622 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
12623 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
12624 }
12625 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
12626 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
12627 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
12628 }
12629
12630 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
12631 return Concat;
12632
12633 bool DstIsLeft;
12634 int Anomaly;
12635 int NumInputElements = V1.getValueType().getVectorNumElements();
12636 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
12637 SDValue DstVec = DstIsLeft ? V1 : V2;
12638 SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
12639
12640 SDValue SrcVec = V1;
12641 int SrcLane = ShuffleMask[Anomaly];
12642 if (SrcLane >= NumInputElements) {
12643 SrcVec = V2;
12644 SrcLane -= VT.getVectorNumElements();
12645 }
12646 SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
12647
12648 EVT ScalarVT = VT.getVectorElementType();
12649
12650 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
12651 ScalarVT = MVT::i32;
12652
12653 return DAG.getNode(
12654 ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
12655 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
12656 DstLaneV);
12657 }
12658
12659 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
12660 return NewSD;
12661
12662 // If the shuffle is not directly supported and it has 4 elements, use
12663 // the PerfectShuffle-generated table to synthesize it from other shuffles.
12664 unsigned NumElts = VT.getVectorNumElements();
12665 if (NumElts == 4) {
12666 unsigned PFIndexes[4];
12667 for (unsigned i = 0; i != 4; ++i) {
12668 if (ShuffleMask[i] < 0)
12669 PFIndexes[i] = 8;
12670 else
12671 PFIndexes[i] = ShuffleMask[i];
12672 }
12673
12674 // Compute the index in the perfect shuffle table.
12675 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
12676 PFIndexes[2] * 9 + PFIndexes[3];
12677 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
12678 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
12679 dl);
12680 }
12681
12682 return GenerateTBL(Op, ShuffleMask, DAG);
12683}
12684
12685SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
12686 SelectionDAG &DAG) const {
12687 EVT VT = Op.getValueType();
12688
12689 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
12690 return LowerToScalableOp(Op, DAG);
12691
12692 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
12693 "Unexpected vector type!");
12694
12695 // We can handle the constant cases during isel.
12696 if (isa<ConstantSDNode>(Op.getOperand(0)))
12697 return Op;
12698
12699 // There isn't a natural way to handle the general i1 case, so we use some
12700 // trickery with whilelo.
12701 SDLoc DL(Op);
12702 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
12703 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
12704 DAG.getValueType(MVT::i1));
12705 SDValue ID =
12706 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
12707 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
12708 if (VT == MVT::nxv1i1)
12709 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
12710 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
12711 Zero, SplatVal),
12712 Zero);
12713 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
12714}
12715
12716SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
12717 SelectionDAG &DAG) const {
12718 SDLoc DL(Op);
12719
12720 EVT VT = Op.getValueType();
12721 if (!isTypeLegal(VT) || !VT.isScalableVector())
12722 return SDValue();
12723
12724 // Current lowering only supports the SVE-ACLE types.
12725 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
12726 return SDValue();
12727
12728 // The DUPQ operation is independent of element type so normalise to i64s.
12729 SDValue Idx128 = Op.getOperand(2);
12730
12731 // DUPQ can be used when idx is in range.
12732 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
12733 if (CIdx && (CIdx->getZExtValue() <= 3)) {
12734 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
12735 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
12736 }
12737
12738 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
12739
12740 // The ACLE says this must produce the same result as:
12741 // svtbl(data, svadd_x(svptrue_b64(),
12742 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
12743 // index * 2))
12744 SDValue One = DAG.getConstant(1, DL, MVT::i64);
12745 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
12746
12747 // create the vector 0,1,0,1,...
12748 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
12749 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
12750
12751 // create the vector idx64,idx64+1,idx64,idx64+1,...
12752 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
12753 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
12754 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
12755
12756 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
12757 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
12758 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
12759}
12760
12761
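// Resolve a constant-splat BUILD_VECTOR into replicated bit masks: CnstBits holds
// the splat value repeated across the full vector width and UndefBits the
// corresponding bits derived from the splat's undef lanes; returns false if BVN
// is not a constant splat.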
12762static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
12763 APInt &UndefBits) {
12764 EVT VT = BVN->getValueType(0);
12765 APInt SplatBits, SplatUndef;
12766 unsigned SplatBitSize;
12767 bool HasAnyUndefs;
12768 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
12769 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
12770
12771 for (unsigned i = 0; i < NumSplats; ++i) {
12772 CnstBits <<= SplatBitSize;
12773 UndefBits <<= SplatBitSize;
12774 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
12775 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
12776 }
12777
12778 return true;
12779 }
12780
12781 return false;
12782}
12783
12784// Try 64-bit splatted SIMD immediate.
12785static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12786 const APInt &Bits) {
12787 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12788 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12789 EVT VT = Op.getValueType();
12790 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
12791
12792 if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
12793 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
12794
12795 SDLoc dl(Op);
12796 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12797 DAG.getConstant(Value, dl, MVT::i32));
12798 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12799 }
12800 }
12801
12802 return SDValue();
12803}
12804
12805// Try 32-bit splatted SIMD immediate.
12806static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12807 const APInt &Bits,
12808 const SDValue *LHS = nullptr) {
12809 EVT VT = Op.getValueType();
12810 if (VT.isFixedLengthVector() &&
12811 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
12812 return SDValue();
12813
12814 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12815 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12816 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
12817 bool isAdvSIMDModImm = false;
12818 uint64_t Shift;
12819
12820 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
12821 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
12822 Shift = 0;
12823 }
12824 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
12825 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
12826 Shift = 8;
12827 }
12828 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
12829 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
12830 Shift = 16;
12831 }
12832 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
12833 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
12834 Shift = 24;
12835 }
12836
12837 if (isAdvSIMDModImm) {
12838 SDLoc dl(Op);
12839 SDValue Mov;
12840
12841 if (LHS)
12842 Mov = DAG.getNode(NewOp, dl, MovTy,
12843 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
12844 DAG.getConstant(Value, dl, MVT::i32),
12845 DAG.getConstant(Shift, dl, MVT::i32));
12846 else
12847 Mov = DAG.getNode(NewOp, dl, MovTy,
12848 DAG.getConstant(Value, dl, MVT::i32),
12849 DAG.getConstant(Shift, dl, MVT::i32));
12850
12851 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12852 }
12853 }
12854
12855 return SDValue();
12856}
12857
12858// Try 16-bit splatted SIMD immediate.
12859static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12860 const APInt &Bits,
12861 const SDValue *LHS = nullptr) {
12862 EVT VT = Op.getValueType();
12863 if (VT.isFixedLengthVector() &&
12864 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
12865 return SDValue();
12866
12867 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12868 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12869 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
12870 bool isAdvSIMDModImm = false;
12871 uint64_t Shift;
12872
12873 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
12874 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
12875 Shift = 0;
12876 }
12877 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
12878 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
12879 Shift = 8;
12880 }
12881
12882 if (isAdvSIMDModImm) {
12883 SDLoc dl(Op);
12884 SDValue Mov;
12885
12886 if (LHS)
12887 Mov = DAG.getNode(NewOp, dl, MovTy,
12888 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
12889 DAG.getConstant(Value, dl, MVT::i32),
12890 DAG.getConstant(Shift, dl, MVT::i32));
12891 else
12892 Mov = DAG.getNode(NewOp, dl, MovTy,
12893 DAG.getConstant(Value, dl, MVT::i32),
12894 DAG.getConstant(Shift, dl, MVT::i32));
12895
12896 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12897 }
12898 }
12899
12900 return SDValue();
12901}
12902
12903// Try 32-bit splatted SIMD immediate with shifted ones.
12904static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
12905 SelectionDAG &DAG, const APInt &Bits) {
12906 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12907 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12908 EVT VT = Op.getValueType();
12909 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
12910 bool isAdvSIMDModImm = false;
12911 uint64_t Shift;
12912
12913 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
12914 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
12915 Shift = 264;
12916 }
12917 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
12918 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
12919 Shift = 272;
12920 }
12921
12922 if (isAdvSIMDModImm) {
12923 SDLoc dl(Op);
12924 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12925 DAG.getConstant(Value, dl, MVT::i32),
12926 DAG.getConstant(Shift, dl, MVT::i32));
12927 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12928 }
12929 }
12930
12931 return SDValue();
12932}
12933
12934// Try 8-bit splatted SIMD immediate.
12935static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12936 const APInt &Bits) {
12937 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12938 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12939 EVT VT = Op.getValueType();
12940 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
12941
12942 if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
12943 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
12944
12945 SDLoc dl(Op);
12946 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12947 DAG.getConstant(Value, dl, MVT::i32));
12948 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12949 }
12950 }
12951
12952 return SDValue();
12953}
12954
12955// Try FP splatted SIMD immediate.
12956static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12957 const APInt &Bits) {
12958 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12959 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12960 EVT VT = Op.getValueType();
12961 bool isWide = (VT.getSizeInBits() == 128);
12962 MVT MovTy;
12963 bool isAdvSIMDModImm = false;
12964
12965 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
12966 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
12967 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
12968 }
12969 else if (isWide &&
12970 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
12971 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
12972 MovTy = MVT::v2f64;
12973 }
12974
12975 if (isAdvSIMDModImm) {
12976 SDLoc dl(Op);
12977 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12978 DAG.getConstant(Value, dl, MVT::i32));
12979 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12980 }
12981 }
12982
12983 return SDValue();
12984}
12985
12986// Specialized code to quickly find if PotentialBVec is a BuildVector that
12987// consists of only the same constant int value, returned in reference arg
12988// ConstVal
12989static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
12990 uint64_t &ConstVal) {
12991 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
12992 if (!Bvec)
12993 return false;
12994 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
12995 if (!FirstElt)
12996 return false;
12997 EVT VT = Bvec->getValueType(0);
12998 unsigned NumElts = VT.getVectorNumElements();
12999 for (unsigned i = 1; i < NumElts; ++i)
13000 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
13001 return false;
13002 ConstVal = FirstElt->getZExtValue();
13003 return true;
13004}
13005
13006static bool isAllInactivePredicate(SDValue N) {
13007 // Look through cast.
13008 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
13009 N = N.getOperand(0);
13010
13011 return ISD::isConstantSplatVectorAllZeros(N.getNode());
13012}
13013
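// Return true when N is known to be an all-active SVE predicate: an all-ones
// constant splat (looking through REINTERPRET_CAST), a suitable "ptrue ..., all",
// or, when the minimum and maximum SVE vector lengths coincide, a ptrue pattern
// whose element count matches the runtime element count.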
13014static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
13015 unsigned NumElts = N.getValueType().getVectorMinNumElements();
13016
13017 // Look through cast.
13018 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
13019 N = N.getOperand(0);
13020 // When reinterpreting from a type with fewer elements the "new" elements
13021 // are not active, so bail if they're likely to be used.
13022 if (N.getValueType().getVectorMinNumElements() < NumElts)
13023 return false;
13024 }
13025
13026 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
13027 return true;
13028
13029 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
13030 // or smaller than the implicit element type represented by N.
13031 // NOTE: A larger element count implies a smaller element type.
13032 if (N.getOpcode() == AArch64ISD::PTRUE &&
13033 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
13034 return N.getValueType().getVectorMinNumElements() >= NumElts;
13035
13036 // If we're compiling for a specific vector-length, we can check if the
13037 // pattern's VL equals that of the scalable vector at runtime.
13038 if (N.getOpcode() == AArch64ISD::PTRUE) {
13039 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
13040 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
13041 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
13042 if (MaxSVESize && MinSVESize == MaxSVESize) {
13043 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
13044 unsigned PatNumElts =
13045 getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
13046 return PatNumElts == (NumElts * VScale);
13047 }
13048 }
13049
13050 return false;
13051}
13052
13053// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
13054// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
13055// BUILD_VECTORs with constant element C1, C2 is a constant, and:
13056// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
13057// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
13058// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
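// For example, with 32-bit elements and C2 == 8:
//   (or (and X, <0x000000FF, ...>), (AArch64ISD::VSHL Y, 8))
//     ==> (AArch64ISD::VSLI X, Y, 8)
// since 0x000000FF == ~(0xFFFFFFFF << 8): the OR only fills the low bits that the
// shifted value leaves as zero, which is exactly what SLI preserves from X.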
13059static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
13060 EVT VT = N->getValueType(0);
13061
13062 if (!VT.isVector())
13063 return SDValue();
13064
13065 SDLoc DL(N);
13066
13067 SDValue And;
13068 SDValue Shift;
13069
13070 SDValue FirstOp = N->getOperand(0);
13071 unsigned FirstOpc = FirstOp.getOpcode();
13072 SDValue SecondOp = N->getOperand(1);
13073 unsigned SecondOpc = SecondOp.getOpcode();
13074
13075 // Is one of the operands an AND or a BICi? The AND may have been optimised to
13076 // a BICi in order to use an immediate instead of a register.
13077 // Is the other operand a shl or lshr? This will have been turned into:
13078 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
13079 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
13080 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
13081 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
13082 SecondOpc == AArch64ISD::SHL_PRED ||
13083 SecondOpc == AArch64ISD::SRL_PRED)) {
13084 And = FirstOp;
13085 Shift = SecondOp;
13086
13087 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
13088 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
13089 FirstOpc == AArch64ISD::SHL_PRED ||
13090 FirstOpc == AArch64ISD::SRL_PRED)) {
13091 And = SecondOp;
13092 Shift = FirstOp;
13093 } else
13094 return SDValue();
13095
13096 bool IsAnd = And.getOpcode() == ISD::AND;
13097 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
13098 Shift.getOpcode() == AArch64ISD::SRL_PRED;
13099 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
13100 Shift.getOpcode() == AArch64ISD::SRL_PRED;
13101
13102 // Is the shift amount constant and are all lanes active?
13103 uint64_t C2;
13104 if (ShiftHasPredOp) {
13105 if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
13106 return SDValue();
13107 APInt C;
13108 if (!ISD::isConstantSplatVector(Shift.getOperand(2).getNode(), C))
13109 return SDValue();
13110 C2 = C.getZExtValue();
13111 } else if (ConstantSDNode *C2node =
13112 dyn_cast<ConstantSDNode>(Shift.getOperand(1)))
13113 C2 = C2node->getZExtValue();
13114 else
13115 return SDValue();
13116
13117 APInt C1AsAPInt;
13118 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
13119 if (IsAnd) {
13120 // Is the and mask vector all constant?
13121 if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
13122 return SDValue();
13123 } else {
13124 // Reconstruct the corresponding AND immediate from the two BICi immediates.
13125 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
13126 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
13127 assert(C1nodeImm && C1nodeShift);
13128 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
13129 C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
13130 }
13131
13132 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
13133 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
13134 // how much one can shift elements of a particular size?
13135 if (C2 > ElemSizeInBits)
13136 return SDValue();
13137
13138 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
13139 : APInt::getLowBitsSet(ElemSizeInBits, C2);
13140 if (C1AsAPInt != RequiredC1)
13141 return SDValue();
13142
13143 SDValue X = And.getOperand(0);
13144 SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
13145 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
13146 : Shift.getOperand(1);
13147
13148 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
13149 SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Imm);
13150
13151 LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
13152 LLVM_DEBUG(N->dump(&DAG));
13153 LLVM_DEBUG(dbgs() << "into: \n");
13154 LLVM_DEBUG(ResultSLI->dump(&DAG));
13155
13156 ++NumShiftInserts;
13157 return ResultSLI;
13158}
13159
13160SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
13161 SelectionDAG &DAG) const {
13162 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13163 !Subtarget->isNeonAvailable()))
13164 return LowerToScalableOp(Op, DAG);
13165
13166 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
13167 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
13168 return Res;
13169
13170 EVT VT = Op.getValueType();
13171 if (VT.isScalableVector())
13172 return Op;
13173
13174 SDValue LHS = Op.getOperand(0);
13175 BuildVectorSDNode *BVN =
13176 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
13177 if (!BVN) {
13178 // OR commutes, so try swapping the operands.
13179 LHS = Op.getOperand(1);
13180 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
13181 }
13182 if (!BVN)
13183 return Op;
13184
13185 APInt DefBits(VT.getSizeInBits(), 0);
13186 APInt UndefBits(VT.getSizeInBits(), 0);
13187 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
13188 SDValue NewOp;
13189
13190 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
13191 DefBits, &LHS)) ||
13192 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
13193 DefBits, &LHS)))
13194 return NewOp;
13195
13196 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
13197 UndefBits, &LHS)) ||
13198 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
13199 UndefBits, &LHS)))
13200 return NewOp;
13201 }
13202
13203 // We can always fall back to a non-immediate OR.
13204 return Op;
13205}
13206
13207// Normalize the operands of BUILD_VECTOR. The value of constant operands will
13208// be truncated to fit element width.
13209static SDValue NormalizeBuildVector(SDValue Op,
13210 SelectionDAG &DAG) {
13211 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13212 SDLoc dl(Op);
13213 EVT VT = Op.getValueType();
13214 EVT EltTy= VT.getVectorElementType();
13215
13216 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
13217 return Op;
13218
13219 SmallVector<SDValue, 16> Ops;
13220 for (SDValue Lane : Op->ops()) {
13221 // For integer vectors, type legalization would have promoted the
13222 // operands already. Otherwise, if Op is a floating-point splat
13223 // (with operands cast to integers), then the only possibilities
13224 // are constants and UNDEFs.
13225 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
13226 APInt LowBits(EltTy.getSizeInBits(),
13227 CstLane->getZExtValue());
13228 Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
13229 } else if (Lane.getNode()->isUndef()) {
13230 Lane = DAG.getUNDEF(MVT::i32);
13231 } else {
13232 assert(Lane.getValueType() == MVT::i32 &&
13233 "Unexpected BUILD_VECTOR operand type");
13234 }
13235 Ops.push_back(Lane);
13236 }
13237 return DAG.getBuildVector(VT, dl, Ops);
13238}
13239
13240static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
13241 const AArch64Subtarget *ST) {
13242 EVT VT = Op.getValueType();
13243 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
13244 "Expected a legal NEON vector");
13245
13246 APInt DefBits(VT.getSizeInBits(), 0);
13247 APInt UndefBits(VT.getSizeInBits(), 0);
13248 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
13249 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
13250 auto TryMOVIWithBits = [&](APInt DefBits) {
13251 SDValue NewOp;
13252 if ((NewOp =
13253 tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
13254 (NewOp =
13255 tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
13256 (NewOp =
13257 tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
13258 (NewOp =
13259 tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
13260 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
13261 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
13262 return NewOp;
13263
13264 APInt NotDefBits = ~DefBits;
13265 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
13266 NotDefBits)) ||
13267 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
13268 NotDefBits)) ||
13269 (NewOp =
13270 tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
13271 return NewOp;
13272 return SDValue();
13273 };
13274 if (SDValue R = TryMOVIWithBits(DefBits))
13275 return R;
13276 if (SDValue R = TryMOVIWithBits(UndefBits))
13277 return R;
13278
13279 // See if a fneg of the constant can be materialized with a MOVI, etc
13280 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
13281 // FNegate each sub-element of the constant
13282 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
13283 APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
13284 .zext(VT.getSizeInBits());
13285 APInt NegBits(VT.getSizeInBits(), 0);
13286 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
13287 for (unsigned i = 0; i < NumElts; i++)
13288 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
13289 NegBits = DefBits ^ NegBits;
13290
13291 // Try to create the new constants with MOVI, and if so generate a fneg
13292 // for it.
13293 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
13294 SDLoc DL(Op);
13295 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
13296 return DAG.getNode(
13297 AArch64ISD::NVCAST, DL, VT,
13298 DAG.getNode(ISD::FNEG, DL, VFVT,
13299 DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
13300 }
13301 return SDValue();
13302 };
13303 SDValue R;
13304 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
13305 (R = TryWithFNeg(DefBits, MVT::f64)) ||
13306 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
13307 return R;
13308 }
13309
13310 return SDValue();
13311}
13312
13313SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
13314 SelectionDAG &DAG) const {
13315 EVT VT = Op.getValueType();
13316
13317 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
13318 if (auto SeqInfo = cast<BuildVectorSDNode>(Op)->isConstantSequence()) {
13319 SDLoc DL(Op);
13320 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
13321 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
13322 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
13323 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
13324 return convertFromScalableVector(DAG, Op.getValueType(), Seq);
13325 }
13326
13327 // Revert to common legalisation for all other variants.
13328 return SDValue();
13329 }
13330
13331 // Try to build a simple constant vector.
13332 Op = NormalizeBuildVector(Op, DAG);
13333 // Though this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS), if so,
13334 // abort.
13335 if (Op.getOpcode() != ISD::BUILD_VECTOR)
13336 return SDValue();
13337
13338 // Certain vector constants, used to express things like logical NOT and
13339 // arithmetic NEG, are passed through unmodified. This allows special
13340 // patterns for these operations to match, which will lower these constants
13341 // to whatever is proven necessary.
13342 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
13343 if (BVN->isConstant()) {
13344 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
13345 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
13346 APInt Val(BitSize,
13347 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
13348 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
13349 return Op;
13350 }
13351 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
13352 if (Const->isZero() && !Const->isNegative())
13353 return Op;
13354 }
13355
13356 if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
13357 return V;
13358
13359 // Scan through the operands to find some interesting properties we can
13360 // exploit:
13361 // 1) If only one value is used, we can use a DUP, or
13362 // 2) if only the low element is not undef, we can just insert that, or
13363 // 3) if only one constant value is used (w/ some non-constant lanes),
13364 // we can splat the constant value into the whole vector then fill
13365 // in the non-constant lanes.
13366 // 4) FIXME: If different constant values are used, but we can intelligently
13367 // select the values we'll be overwriting for the non-constant
13368 // lanes such that we can directly materialize the vector
13369 // some other way (MOVI, e.g.), we can be sneaky.
13370 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
13371 SDLoc dl(Op);
13372 unsigned NumElts = VT.getVectorNumElements();
13373 bool isOnlyLowElement = true;
13374 bool usesOnlyOneValue = true;
13375 bool usesOnlyOneConstantValue = true;
13376 bool isConstant = true;
13377 bool AllLanesExtractElt = true;
13378 unsigned NumConstantLanes = 0;
13379 unsigned NumDifferentLanes = 0;
13380 unsigned NumUndefLanes = 0;
13381 SDValue Value;
13382 SDValue ConstantValue;
13383 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
13384 unsigned ConsecutiveValCount = 0;
13385 SDValue PrevVal;
13386 for (unsigned i = 0; i < NumElts; ++i) {
13387 SDValue V = Op.getOperand(i);
13388 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13389 AllLanesExtractElt = false;
13390 if (V.isUndef()) {
13391 ++NumUndefLanes;
13392 continue;
13393 }
13394 if (i > 0)
13395 isOnlyLowElement = false;
13396 if (!isIntOrFPConstant(V))
13397 isConstant = false;
13398
13399 if (isIntOrFPConstant(V)) {
13400 ++NumConstantLanes;
13401 if (!ConstantValue.getNode())
13402 ConstantValue = V;
13403 else if (ConstantValue != V)
13404 usesOnlyOneConstantValue = false;
13405 }
13406
13407 if (!Value.getNode())
13408 Value = V;
13409 else if (V != Value) {
13410 usesOnlyOneValue = false;
13411 ++NumDifferentLanes;
13412 }
13413
13414 if (PrevVal != V) {
13415 ConsecutiveValCount = 0;
13416 PrevVal = V;
13417 }
13418
13419 // Keep the different values and their last consecutive counts. For example,
13420 //
13421 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
13422 // t24, t24, t24, t24, t24, t24, t24, t24
13423 // t23 = consecutive count 8
13424 // t24 = consecutive count 8
13425 // ------------------------------------------------------------------
13426 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
13427 // t24, t24, t24, t24, t24, t24, t24, t24
13428 // t23 = consecutive count 5
13429 // t24 = consecutive count 9
13430 DifferentValueMap[V] = ++ConsecutiveValCount;
13431 }
13432
13433 if (!Value.getNode()) {
13434 LLVM_DEBUG(
13435 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
13436 return DAG.getUNDEF(VT);
13437 }
13438
13439 // Convert BUILD_VECTOR where all elements but the lowest are undef into
13440 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
13441 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
13442 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
13443 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
13444 "SCALAR_TO_VECTOR node\n");
13445 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
13446 }
13447
13448 if (AllLanesExtractElt) {
13449 SDNode *Vector = nullptr;
13450 bool Even = false;
13451 bool Odd = false;
13452 // Check whether the extract elements match the Even pattern <0,2,4,...> or
13453 // the Odd pattern <1,3,5,...>.
13454 for (unsigned i = 0; i < NumElts; ++i) {
13455 SDValue V = Op.getOperand(i);
13456 const SDNode *N = V.getNode();
13457 if (!isa<ConstantSDNode>(N->getOperand(1))) {
13458 Even = false;
13459 Odd = false;
13460 break;
13461 }
13462 SDValue N0 = N->getOperand(0);
13463
13464 // All elements are extracted from the same vector.
13465 if (!Vector) {
13466 Vector = N0.getNode();
13467 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
13468 // BUILD_VECTOR.
13469 if (VT.getVectorElementType() !=
13470 N0.getValueType().getVectorElementType())
13471 break;
13472 } else if (Vector != N0.getNode()) {
13473 Odd = false;
13474 Even = false;
13475 break;
13476 }
13477
13478 // Extracted values are either at Even indices <0,2,4,...> or at Odd
13479 // indices <1,3,5,...>.
13480 uint64_t Val = N->getConstantOperandVal(1);
13481 if (Val == 2 * i) {
13482 Even = true;
13483 continue;
13484 }
13485 if (Val - 1 == 2 * i) {
13486 Odd = true;
13487 continue;
13488 }
13489
13490 // Something does not match: abort.
13491 Odd = false;
13492 Even = false;
13493 break;
13494 }
13495 if (Even || Odd) {
13496 SDValue LHS =
13497 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
13498 DAG.getConstant(0, dl, MVT::i64));
13499 SDValue RHS =
13500 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
13501 DAG.getConstant(NumElts, dl, MVT::i64));
13502
13503 if (Even && !Odd)
13504 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
13505 RHS);
13506 if (Odd && !Even)
13507 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
13508 RHS);
13509 }
13510 }
13511
13512 // Use DUP for non-constant splats. For f32 constant splats, reduce to
13513 // i32 and try again.
13514 if (usesOnlyOneValue) {
13515 if (!isConstant) {
13516 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13517 Value.getValueType() != VT) {
13518 LLVM_DEBUG(
13519 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
13520 return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
13521 }
13522
13523 // This is actually a DUPLANExx operation, which keeps everything vectory.
13524
13525 SDValue Lane = Value.getOperand(1);
13526 Value = Value.getOperand(0);
13527 if (Value.getValueSizeInBits() == 64) {
13528 LLVM_DEBUG(
13529 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
13530 "widening it\n");
13531 Value = WidenVector(Value, DAG);
13532 }
13533
13534 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
13535 return DAG.getNode(Opcode, dl, VT, Value, Lane);
13536 }
13537
13538 if (VT.getVectorElementType().isFloatingPoint()) {
13539 SmallVector<SDValue, 8> Ops;
13540 EVT EltTy = VT.getVectorElementType();
13541 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
13542 EltTy == MVT::f64) && "Unsupported floating-point vector type");
13543 LLVM_DEBUG(
13544 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
13545 "BITCASTS, and try again\n");
13546 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
13547 for (unsigned i = 0; i < NumElts; ++i)
13548 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
13549 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
13550 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
13551 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
13552 Val.dump(););
13553 Val = LowerBUILD_VECTOR(Val, DAG);
13554 if (Val.getNode())
13555 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
13556 }
13557 }
13558
13559 // If we need to insert a small number of different non-constant elements and
13560 // the vector width is sufficiently large, prefer using DUP with the common
13561 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
13562 // skip the constant lane handling below.
13563 bool PreferDUPAndInsert =
13564 !isConstant && NumDifferentLanes >= 1 &&
13565 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
13566 NumDifferentLanes >= NumConstantLanes;
13567
13568 // If there was only one constant value used and for more than one lane,
13569 // start by splatting that value, then replace the non-constant lanes. This
13570 // is better than the default, which will perform a separate initialization
13571 // for each lane.
13572 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
13573 // Firstly, try to materialize the splat constant.
13574 SDValue Val = DAG.getSplatBuildVector(VT, dl, ConstantValue);
13575 unsigned BitSize = VT.getScalarSizeInBits();
13576 APInt ConstantValueAPInt(1, 0);
13577 if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
13578 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
13579 if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
13580 !ConstantValueAPInt.isAllOnes()) {
13581 Val = ConstantBuildVector(Val, DAG, Subtarget);
13582 if (!Val)
13583 // Otherwise, materialize the constant and splat it.
13584 Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
13585 }
13586
13587 // Now insert the non-constant lanes.
13588 for (unsigned i = 0; i < NumElts; ++i) {
13589 SDValue V = Op.getOperand(i);
13590 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
13591 if (!isIntOrFPConstant(V))
13592 // Note that type legalization likely mucked about with the VT of the
13593 // source operand, so we may have to convert it here before inserting.
13594 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
13595 }
13596 return Val;
13597 }
13598
13599 // This will generate a load from the constant pool.
13600 if (isConstant) {
13601 LLVM_DEBUG(
13602 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
13603 "expansion\n");
13604 return SDValue();
13605 }
13606
13607 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
13608 // v4i32s. This is really a truncate, which we can construct out of (legal)
13609 // concats and truncate nodes.
13611 return M;
13612
13613 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
13614 if (NumElts >= 4) {
13615 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
13616 return Shuffle;
13617
13618 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
13619 return Shuffle;
13620 }
13621
13622 if (PreferDUPAndInsert) {
13623 // First, build a constant vector with the common element.
13624 SmallVector<SDValue, 8> Ops(NumElts, Value);
13625 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
13626 // Next, insert the elements that do not match the common value.
13627 for (unsigned I = 0; I < NumElts; ++I)
13628 if (Op.getOperand(I) != Value)
13629 NewVector =
13630 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
13631 Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
13632
13633 return NewVector;
13634 }
13635
13636 // If vector consists of two different values, try to generate two DUPs and
13637 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
13638 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
13639    SmallVector<SDValue, 2> Vals;
13640    // Check that each value appears in a consecutive run covering exactly half
13641    // of the vector elements. In that case we can use CONCAT_VECTORS. For example,
13642 //
13643 // canUseVECTOR_CONCAT = true;
13644 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
13645 // t24, t24, t24, t24, t24, t24, t24, t24
13646 //
13647 // canUseVECTOR_CONCAT = false;
13648 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
13649 // t24, t24, t24, t24, t24, t24, t24, t24
13650 bool canUseVECTOR_CONCAT = true;
13651 for (auto Pair : DifferentValueMap) {
13652      // Check that each value fills exactly NumElts / 2 consecutive lanes.
13653 if (Pair.second != NumElts / 2)
13654 canUseVECTOR_CONCAT = false;
13655 Vals.push_back(Pair.first);
13656 }
13657
13658 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
13659 // CONCAT_VECTORs. For example,
13660 //
13661 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
13662 // t24, t24, t24, t24, t24, t24, t24, t24
13663 // ==>
13664 // t26: v8i8 = AArch64ISD::DUP t23
13665 // t28: v8i8 = AArch64ISD::DUP t24
13666 // t29: v16i8 = concat_vectors t26, t28
13667 if (canUseVECTOR_CONCAT) {
13668 EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
13669 if (isTypeLegal(SubVT) && SubVT.isVector() &&
13670 SubVT.getVectorNumElements() >= 2) {
13671 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
13672 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
13673 SDValue DUP1 =
13674 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops1), DAG);
13675 SDValue DUP2 =
13676 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops2), DAG);
13677        SDValue CONCAT_VECTORS =
13678            DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DUP1, DUP2);
13679 return CONCAT_VECTORS;
13680 }
13681 }
13682
13683 // Let's try to generate VECTOR_SHUFFLE. For example,
13684 //
13685 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
13686 // ==>
13687 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
13688 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
13689 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
13690 if (NumElts >= 8) {
13691 SmallVector<int, 16> MaskVec;
13692      // Build mask for VECTOR_SHUFFLE.
13693 SDValue FirstLaneVal = Op.getOperand(0);
13694 for (unsigned i = 0; i < NumElts; ++i) {
13695 SDValue Val = Op.getOperand(i);
13696 if (FirstLaneVal == Val)
13697 MaskVec.push_back(i);
13698 else
13699 MaskVec.push_back(i + NumElts);
13700 }
13701
13702 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
13703 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
13704 SDValue VEC1 = DAG.getBuildVector(VT, dl, Ops1);
13705 SDValue VEC2 = DAG.getBuildVector(VT, dl, Ops2);
13706      SDValue VECTOR_SHUFFLE =
13707          DAG.getVectorShuffle(VT, dl, VEC1, VEC2, MaskVec);
13708 return VECTOR_SHUFFLE;
13709 }
13710 }
13711
13712 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
13713 // know the default expansion would otherwise fall back on something even
13714 // worse. For a vector with one or two non-undef values, that's
13715 // scalar_to_vector for the elements followed by a shuffle (provided the
13716 // shuffle is valid for the target) and materialization element by element
13717 // on the stack followed by a load for everything else.
13718 if (!isConstant && !usesOnlyOneValue) {
13719 LLVM_DEBUG(
13720 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
13721 "of INSERT_VECTOR_ELT\n");
13722
13723 SDValue Vec = DAG.getUNDEF(VT);
13724 SDValue Op0 = Op.getOperand(0);
13725 unsigned i = 0;
13726
13727 // Use SCALAR_TO_VECTOR for lane zero to
13728 // a) Avoid a RMW dependency on the full vector register, and
13729 // b) Allow the register coalescer to fold away the copy if the
13730 // value is already in an S or D register, and we're forced to emit an
13731 // INSERT_SUBREG that we can't fold anywhere.
13732 //
13733 // We also allow types like i8 and i16 which are illegal scalar but legal
13734 // vector element types. After type-legalization the inserted value is
13735 // extended (i32) and it is safe to cast them to the vector type by ignoring
13736 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
13737 if (!Op0.isUndef()) {
13738 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
13739 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
13740 ++i;
13741 }
13742 LLVM_DEBUG(if (i < NumElts) dbgs()
13743 << "Creating nodes for the other vector elements:\n";);
13744 for (; i < NumElts; ++i) {
13745 SDValue V = Op.getOperand(i);
13746 if (V.isUndef())
13747 continue;
13748 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
13749 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
13750 }
13751 return Vec;
13752 }
13753
13754 LLVM_DEBUG(
13755 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
13756 "better alternative\n");
13757 return SDValue();
13758}
13759
13760SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
13761 SelectionDAG &DAG) const {
13762 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13763 !Subtarget->isNeonAvailable()))
13764 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
13765
13766 assert(Op.getValueType().isScalableVector() &&
13767 isTypeLegal(Op.getValueType()) &&
13768 "Expected legal scalable vector type!");
13769
13770 if (isTypeLegal(Op.getOperand(0).getValueType())) {
13771 unsigned NumOperands = Op->getNumOperands();
13772 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
13773 "Unexpected number of operands in CONCAT_VECTORS");
13774
13775 if (NumOperands == 2)
13776 return Op;
13777
13778 // Concat each pair of subvectors and pack into the lower half of the array.
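    // For example (illustrative): with operands [A, B, C, D] the first pass
    // produces [AB, CD] and the second pass produces the final ABCD vector.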
13779 SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());
13780 while (ConcatOps.size() > 1) {
13781 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
13782 SDValue V1 = ConcatOps[I];
13783 SDValue V2 = ConcatOps[I + 1];
13784 EVT SubVT = V1.getValueType();
13785 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
13786 ConcatOps[I / 2] =
13787 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
13788 }
13789 ConcatOps.resize(ConcatOps.size() / 2);
13790 }
13791 return ConcatOps[0];
13792 }
13793
13794 return SDValue();
13795}
13796
13797SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
13798 SelectionDAG &DAG) const {
13799 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
13800
13801 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13802 !Subtarget->isNeonAvailable()))
13803 return LowerFixedLengthInsertVectorElt(Op, DAG);
13804
13805 EVT VT = Op.getOperand(0).getValueType();
13806
13807 if (VT.getScalarType() == MVT::i1) {
13808 EVT VectorVT = getPromotedVTForPredicate(VT);
13809 SDLoc DL(Op);
13810 SDValue ExtendedVector =
13811 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
13812 SDValue ExtendedValue =
13813 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
13814 VectorVT.getScalarType().getSizeInBits() < 32
13815 ? MVT::i32
13816 : VectorVT.getScalarType());
13817 ExtendedVector =
13818 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
13819 ExtendedValue, Op.getOperand(2));
13820 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
13821 }
13822
13823 // Check for non-constant or out of range lane.
13824 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
13825 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
13826 return SDValue();
13827
13828 return Op;
13829}
13830
13831SDValue
13832AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
13833 SelectionDAG &DAG) const {
13834 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
13835 EVT VT = Op.getOperand(0).getValueType();
13836
13837 if (VT.getScalarType() == MVT::i1) {
13838 // We can't directly extract from an SVE predicate; extend it first.
13839 // (This isn't the only possible lowering, but it's straightforward.)
13840 EVT VectorVT = getPromotedVTForPredicate(VT);
13841 SDLoc DL(Op);
13842 SDValue Extend =
13843 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
13844 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
13845 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
13846 Extend, Op.getOperand(1));
13847 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
13848 }
13849
13850 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
13851 return LowerFixedLengthExtractVectorElt(Op, DAG);
13852
13853 // Check for non-constant or out of range lane.
13854 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
13855 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
13856 return SDValue();
13857
13858 // Insertion/extraction are legal for V128 types.
13859 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
13860 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
13861 VT == MVT::v8f16 || VT == MVT::v8bf16)
13862 return Op;
13863
13864 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
13865 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
13866 VT != MVT::v4bf16)
13867 return SDValue();
13868
13869  // For V64 types, we perform extraction by expanding the value
13870  // to a V128 type and performing the extraction on that.
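  // For example (illustrative): extracting lane 3 of a v4i16 value is done as
  // lane 3 of the containing v8i16 register, and the result is produced as an
  // i32 because i16 is not a legal scalar type.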
13871 SDLoc DL(Op);
13872 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
13873 EVT WideTy = WideVec.getValueType();
13874
13875 EVT ExtrTy = WideTy.getVectorElementType();
13876 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
13877 ExtrTy = MVT::i32;
13878
13879 // For extractions, we just return the result directly.
13880 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
13881 Op.getOperand(1));
13882}
13883
13884SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
13885 SelectionDAG &DAG) const {
13886 assert(Op.getValueType().isFixedLengthVector() &&
13887 "Only cases that extract a fixed length vector are supported!");
13888
13889 EVT InVT = Op.getOperand(0).getValueType();
13890 unsigned Idx = Op.getConstantOperandVal(1);
13891 unsigned Size = Op.getValueSizeInBits();
13892
13893 // If we don't have legal types yet, do nothing
13894 if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT))
13895 return SDValue();
13896
13897 if (InVT.isScalableVector()) {
13898 // This will be matched by custom code during ISelDAGToDAG.
13899 if (Idx == 0 && isPackedVectorType(InVT, DAG))
13900 return Op;
13901
13902 return SDValue();
13903 }
13904
13905 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
13906 if (Idx == 0 && InVT.getSizeInBits() <= 128)
13907 return Op;
13908
13909 // If this is extracting the upper 64-bits of a 128-bit vector, we match
13910 // that directly.
13911 if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 &&
13912 InVT.getSizeInBits() == 128 && Subtarget->isNeonAvailable())
13913 return Op;
13914
13915 if (useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
13916 SDLoc DL(Op);
13917
13918 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
13919 SDValue NewInVec =
13920 convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
13921
13922 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ContainerVT, NewInVec,
13923 NewInVec, DAG.getConstant(Idx, DL, MVT::i64));
13924 return convertFromScalableVector(DAG, Op.getValueType(), Splice);
13925 }
13926
13927 return SDValue();
13928}
13929
13930SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
13931 SelectionDAG &DAG) const {
13932 assert(Op.getValueType().isScalableVector() &&
13933 "Only expect to lower inserts into scalable vectors!");
13934
13935 EVT InVT = Op.getOperand(1).getValueType();
13936 unsigned Idx = Op.getConstantOperandVal(2);
13937
13938 SDValue Vec0 = Op.getOperand(0);
13939 SDValue Vec1 = Op.getOperand(1);
13940 SDLoc DL(Op);
13941 EVT VT = Op.getValueType();
13942
13943 if (InVT.isScalableVector()) {
13944 if (!isTypeLegal(VT))
13945 return SDValue();
13946
13947 // Break down insert_subvector into simpler parts.
13948 if (VT.getVectorElementType() == MVT::i1) {
13949 unsigned NumElts = VT.getVectorMinNumElements();
13950 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
13951
13952 SDValue Lo, Hi;
13953 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
13954 DAG.getVectorIdxConstant(0, DL));
13955 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
13956 DAG.getVectorIdxConstant(NumElts / 2, DL));
13957 if (Idx < (NumElts / 2)) {
13958 SDValue NewLo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
13959                                    DAG.getVectorIdxConstant(Idx, DL));
13960        return DAG.getNode(AArch64ISD::UZP1, DL, VT, NewLo, Hi);
13961 } else {
13962 SDValue NewHi =
13963 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
13964 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
13965 return DAG.getNode(AArch64ISD::UZP1, DL, VT, Lo, NewHi);
13966 }
13967 }
13968
13969 // Ensure the subvector is half the size of the main vector.
13970 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
13971 return SDValue();
13972
13973    // Here narrow and wide refer to the vector element types. After "casting"
13974 // both vectors must have the same bit length and so because the subvector
13975 // has fewer elements, those elements need to be bigger.
13978
13979 // NOP cast operands to the largest legal vector of the same element count.
13980 if (VT.isFloatingPoint()) {
13981 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
13982 Vec1 = getSVESafeBitCast(WideVT, Vec1, DAG);
13983 } else {
13984 // Legal integer vectors are already their largest so Vec0 is fine as is.
13985 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
13986 }
13987
13988 // To replace the top/bottom half of vector V with vector SubV we widen the
13989 // preserved half of V, concatenate this to SubV (the order depending on the
13990 // half being replaced) and then narrow the result.
13991 SDValue Narrow;
13992 if (Idx == 0) {
13993 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
13994 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
13995 } else {
13996      assert(Idx == InVT.getVectorMinNumElements() &&
13997             "Invalid subvector index!");
13998 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
13999 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
14000 }
14001
14002 return getSVESafeBitCast(VT, Narrow, DAG);
14003 }
14004
14005 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
14006 // This will be matched by custom code during ISelDAGToDAG.
14007 if (Vec0.isUndef())
14008 return Op;
14009
14010 std::optional<unsigned> PredPattern =
14011        getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
14012    auto PredTy = VT.changeVectorElementType(MVT::i1);
14013 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
14014 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
14015 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
14016 }
14017
14018 return SDValue();
14019}
14020
14021static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
14022 if (Op.getOpcode() != AArch64ISD::DUP &&
14023 Op.getOpcode() != ISD::SPLAT_VECTOR &&
14024 Op.getOpcode() != ISD::BUILD_VECTOR)
14025 return false;
14026
14027 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
14028 !isAllConstantBuildVector(Op, SplatVal))
14029 return false;
14030
14031 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
14032 !isa<ConstantSDNode>(Op->getOperand(0)))
14033 return false;
14034
14035 SplatVal = Op->getConstantOperandVal(0);
14036 if (Op.getValueType().getVectorElementType() != MVT::i64)
14037 SplatVal = (int32_t)SplatVal;
14038
14039 Negated = false;
14040 if (isPowerOf2_64(SplatVal))
14041 return true;
14042
14043 Negated = true;
14044 if (isPowerOf2_64(-SplatVal)) {
14045 SplatVal = -SplatVal;
14046 return true;
14047 }
14048
14049 return false;
14050}
14051
14052SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
14053 EVT VT = Op.getValueType();
14054 SDLoc dl(Op);
14055
14056 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
14057 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
14058
14059 assert(VT.isScalableVector() && "Expected a scalable vector.");
14060
14061 bool Signed = Op.getOpcode() == ISD::SDIV;
14062 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
14063
14064 bool Negated;
14065 uint64_t SplatVal;
14066 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
14067 SDValue Pg = getPredicateForScalableVector(DAG, dl, VT);
14068 SDValue Res =
14069 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0),
14070 DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32));
14071 if (Negated)
14072 Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res);
14073
14074 return Res;
14075 }
14076
14077 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
14078 return LowerToPredicatedOp(Op, DAG, PredOpcode);
14079
14080 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
14081 // operations, and truncate the result.
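  // For example (illustrative): an nxv8i16 UDIV is unpacked with
  // UUNPKLO/UUNPKHI into two nxv4i32 halves, each half is divided, and the
  // two results are narrowed back into one nxv8i16 with UZP1.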
14082 EVT WidenedVT;
14083 if (VT == MVT::nxv16i8)
14084 WidenedVT = MVT::nxv8i16;
14085 else if (VT == MVT::nxv8i16)
14086 WidenedVT = MVT::nxv4i32;
14087 else
14088 llvm_unreachable("Unexpected Custom DIV operation");
14089
14090 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
14091 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
14092 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
14093 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
14094 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
14095 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
14096 SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
14097 SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
14098 return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
14099}
14100
14101bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
14102  // Currently no fixed length shuffles that require SVE are legal.
14103 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14104 return false;
14105
14106 if (VT.getVectorNumElements() == 4 &&
14107 (VT.is128BitVector() || VT.is64BitVector())) {
14108 unsigned Cost = getPerfectShuffleCost(M);
14109 if (Cost <= 1)
14110 return true;
14111 }
14112
14113 bool DummyBool;
14114 int DummyInt;
14115 unsigned DummyUnsigned;
14116
14117 return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
14118 isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
14119 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
14120 // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
14121 isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
14122 isZIPMask(M, VT, DummyUnsigned) ||
14123 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
14124 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
14125 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
14126 isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
14127 isConcatMask(M, VT, VT.getSizeInBits() == 128));
14128}
14129
14130bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
14131                                                   EVT VT) const {
14132 // Just delegate to the generic legality, clear masks aren't special.
14133 return isShuffleMaskLegal(M, VT);
14134}
14135
14136/// getVShiftImm - Check if this is a valid build_vector for the immediate
14137/// operand of a vector shift operation, where all the elements of the
14138/// build_vector must have the same constant integer value.
14139static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
14140 // Ignore bit_converts.
14141 while (Op.getOpcode() == ISD::BITCAST)
14142 Op = Op.getOperand(0);
14143 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
14144 APInt SplatBits, SplatUndef;
14145 unsigned SplatBitSize;
14146 bool HasAnyUndefs;
14147 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
14148 HasAnyUndefs, ElementBits) ||
14149 SplatBitSize > ElementBits)
14150 return false;
14151 Cnt = SplatBits.getSExtValue();
14152 return true;
14153}
14154
14155/// isVShiftLImm - Check if this is a valid build_vector for the immediate
14156/// operand of a vector shift left operation. That value must be in the range:
14157/// 0 <= Value < ElementBits for a left shift; or
14158/// 0 <= Value <= ElementBits for a long left shift.
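/// For example (illustrative): a v4i32 shift left accepts immediates 0..31,
/// while a long (widening) shift additionally allows 32.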
14159static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
14160 assert(VT.isVector() && "vector shift count is not a vector type");
14161 int64_t ElementBits = VT.getScalarSizeInBits();
14162 if (!getVShiftImm(Op, ElementBits, Cnt))
14163 return false;
14164 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
14165}
14166
14167/// isVShiftRImm - Check if this is a valid build_vector for the immediate
14168/// operand of a vector shift right operation. The value must be in the range:
14169///   1 <= Value <= ElementBits for a right shift; or 1 <= Value <= ElementBits/2 for a narrow right shift.
14170static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
14171 assert(VT.isVector() && "vector shift count is not a vector type");
14172 int64_t ElementBits = VT.getScalarSizeInBits();
14173 if (!getVShiftImm(Op, ElementBits, Cnt))
14174 return false;
14175 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
14176}
14177
14178SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
14179 SelectionDAG &DAG) const {
14180 EVT VT = Op.getValueType();
14181
14182 if (VT.getScalarType() == MVT::i1) {
14183 // Lower i1 truncate to `(x & 1) != 0`.
14184 SDLoc dl(Op);
14185 EVT OpVT = Op.getOperand(0).getValueType();
14186 SDValue Zero = DAG.getConstant(0, dl, OpVT);
14187 SDValue One = DAG.getConstant(1, dl, OpVT);
14188 SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
14189 return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
14190 }
14191
14192 if (!VT.isVector() || VT.isScalableVector())
14193 return SDValue();
14194
14195 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
14196 !Subtarget->isNeonAvailable()))
14197 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
14198
14199 return SDValue();
14200}
14201
14202// Check if we can lower this SRL to a rounding shift instruction. ResVT is
14203// possibly a truncated type; it tells how many bits of the value are to be
14204// used.
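// For example (illustrative): with ResVT == nxv8i16, the pattern
//   (srl (add nuw X, splat(64)), splat(7))
// has 64 == 1 << (7 - 1), so it can be selected as a rounding shift right of
// X by 7.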
14205static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT,
14206                                            SelectionDAG &DAG,
14207 unsigned &ShiftValue,
14208 SDValue &RShOperand) {
14209 if (Shift->getOpcode() != ISD::SRL)
14210 return false;
14211
14212 EVT VT = Shift.getValueType();
14213 assert(VT.isScalableVT());
14214
14215 auto ShiftOp1 =
14216 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Shift->getOperand(1)));
14217 if (!ShiftOp1)
14218 return false;
14219
14220 ShiftValue = ShiftOp1->getZExtValue();
14221 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
14222 return false;
14223
14224 SDValue Add = Shift->getOperand(0);
14225 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
14226 return false;
14227
14228  assert(ResVT.getScalarSizeInBits() <= VT.getScalarSizeInBits() &&
14229         "ResVT must be truncated or same type as the shift.");
14230 // Check if an overflow can lead to incorrect results.
14231 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
14232 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
14233 return false;
14234
14235 auto AddOp1 =
14236 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
14237 if (!AddOp1)
14238 return false;
14239 uint64_t AddValue = AddOp1->getZExtValue();
14240 if (AddValue != 1ULL << (ShiftValue - 1))
14241 return false;
14242
14243 RShOperand = Add->getOperand(0);
14244 return true;
14245}
14246
14247SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
14248 SelectionDAG &DAG) const {
14249 EVT VT = Op.getValueType();
14250 SDLoc DL(Op);
14251 int64_t Cnt;
14252
14253 if (!Op.getOperand(1).getValueType().isVector())
14254 return Op;
14255 unsigned EltSize = VT.getScalarSizeInBits();
14256
14257 switch (Op.getOpcode()) {
14258 case ISD::SHL:
14259 if (VT.isScalableVector() ||
14260        useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14261      return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
14262
14263 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
14264 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
14265 DAG.getConstant(Cnt, DL, MVT::i32));
14266 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
14267 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
14268 MVT::i32),
14269 Op.getOperand(0), Op.getOperand(1));
14270 case ISD::SRA:
14271 case ISD::SRL:
14272 if (VT.isScalableVector() && Subtarget->hasSVE2orSME()) {
14273 SDValue RShOperand;
14274 unsigned ShiftValue;
14275 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
14276 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
14277 getPredicateForVector(DAG, DL, VT), RShOperand,
14278 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
14279 }
14280
14281 if (VT.isScalableVector() ||
14282 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
14283 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
14284                                                : AArch64ISD::SRL_PRED;
14285      return LowerToPredicatedOp(Op, DAG, Opc);
14286 }
14287
14288 // Right shift immediate
14289 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
14290 unsigned Opc =
14291 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
14292 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
14293 DAG.getConstant(Cnt, DL, MVT::i32));
14294 }
14295
14296 // Right shift register. Note, there is not a shift right register
14297 // instruction, but the shift left register instruction takes a signed
14298 // value, where negative numbers specify a right shift.
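    // For example (illustrative): a v4i32 SRL by a vector of amounts becomes
    // ushl(x, sub(0, amounts)); SRA uses sshl instead so sign bits are kept.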
14299 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
14300 : Intrinsic::aarch64_neon_ushl;
14301 // negate the shift amount
14302 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
14303 Op.getOperand(1));
14304 SDValue NegShiftLeft =
14305        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
14306                    DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
14307 NegShift);
14308 return NegShiftLeft;
14309 }
14310
14311 llvm_unreachable("unexpected shift opcode");
14312}
14313
14314static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
14315                                    AArch64CC::CondCode CC, bool NoNans, EVT VT,
14316 const SDLoc &dl, SelectionDAG &DAG) {
14317 EVT SrcVT = LHS.getValueType();
14318 assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
14319 "function only supposed to emit natural comparisons");
14320
14321 APInt SplatValue;
14322 APInt SplatUndef;
14323 unsigned SplatBitSize = 0;
14324 bool HasAnyUndefs;
14325
14326 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
14327 bool IsCnst = BVN && BVN->isConstantSplat(SplatValue, SplatUndef,
14328 SplatBitSize, HasAnyUndefs);
14329
14330 bool IsZero = IsCnst && SplatValue == 0;
14331 bool IsOne =
14332 IsCnst && SrcVT.getScalarSizeInBits() == SplatBitSize && SplatValue == 1;
14333 bool IsMinusOne = IsCnst && SplatValue.isAllOnes();
14334
14335 if (SrcVT.getVectorElementType().isFloatingPoint()) {
14336 switch (CC) {
14337 default:
14338 return SDValue();
14339 case AArch64CC::NE: {
14340 SDValue Fcmeq;
14341 if (IsZero)
14342 Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
14343 else
14344 Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
14345 return DAG.getNOT(dl, Fcmeq, VT);
14346 }
14347 case AArch64CC::EQ:
14348 if (IsZero)
14349 return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
14350 return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
14351 case AArch64CC::GE:
14352 if (IsZero)
14353 return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
14354 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
14355 case AArch64CC::GT:
14356 if (IsZero)
14357 return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
14358 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
14359 case AArch64CC::LE:
14360 if (!NoNans)
14361 return SDValue();
14362      // If we ignore NaNs then we can use the LS implementation.
14363 [[fallthrough]];
14364 case AArch64CC::LS:
14365 if (IsZero)
14366 return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
14367 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
14368 case AArch64CC::LT:
14369 if (!NoNans)
14370 return SDValue();
14371      // If we ignore NaNs then we can use the MI implementation.
14372 [[fallthrough]];
14373 case AArch64CC::MI:
14374 if (IsZero)
14375 return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
14376 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
14377 }
14378 }
14379
14380 switch (CC) {
14381 default:
14382 return SDValue();
14383 case AArch64CC::NE: {
14384 SDValue Cmeq;
14385 if (IsZero)
14386 Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
14387 else
14388 Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
14389 return DAG.getNOT(dl, Cmeq, VT);
14390 }
14391 case AArch64CC::EQ:
14392 if (IsZero)
14393 return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
14394 return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
14395 case AArch64CC::GE:
14396 if (IsZero)
14397 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
14398 return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
14399 case AArch64CC::GT:
14400 if (IsZero)
14401 return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
14402 if (IsMinusOne)
14403 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS, RHS);
14404 return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
14405 case AArch64CC::LE:
14406 if (IsZero)
14407 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
14408 return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
14409 case AArch64CC::LS:
14410 return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
14411 case AArch64CC::LO:
14412 return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
14413 case AArch64CC::LT:
14414 if (IsZero)
14415 return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
14416 if (IsOne)
14417 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
14418 return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
14419 case AArch64CC::HI:
14420 return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
14421 case AArch64CC::HS:
14422 return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
14423 }
14424}
14425
14426SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
14427 SelectionDAG &DAG) const {
14428 if (Op.getValueType().isScalableVector())
14429 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
14430
14431 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
14432 !Subtarget->isNeonAvailable()))
14433 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
14434
14435 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
14436 SDValue LHS = Op.getOperand(0);
14437 SDValue RHS = Op.getOperand(1);
14438 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
14439 SDLoc dl(Op);
14440
14441 if (LHS.getValueType().getVectorElementType().isInteger()) {
14442 assert(LHS.getValueType() == RHS.getValueType());
14443    AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
14444    SDValue Cmp =
14445 EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
14446 return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
14447 }
14448
14449 // Lower isnan(x) | isnan(never-nan) to x != x.
14450 // Lower !isnan(x) & !isnan(never-nan) to x == x.
14451 if (CC == ISD::SETUO || CC == ISD::SETO) {
14452 bool OneNaN = false;
14453 if (LHS == RHS) {
14454 OneNaN = true;
14455 } else if (DAG.isKnownNeverNaN(RHS)) {
14456 OneNaN = true;
14457 RHS = LHS;
14458 } else if (DAG.isKnownNeverNaN(LHS)) {
14459 OneNaN = true;
14460 LHS = RHS;
14461 }
14462 if (OneNaN) {
14463      CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
14464    }
14465 }
14466
14467 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
14468
14469 // Make v4f16 (only) fcmp operations utilise vector instructions
14470  // v8f16 support will be a little more complicated
14471 if ((!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) ||
14472 LHS.getValueType().getVectorElementType() == MVT::bf16) {
14473 if (LHS.getValueType().getVectorNumElements() == 4) {
14474 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
14475 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
14476 SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
14477 DAG.ReplaceAllUsesWith(Op, NewSetcc);
14478 CmpVT = MVT::v4i32;
14479 } else
14480 return SDValue();
14481 }
14482
14483 assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
14484 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
14485 LHS.getValueType().getVectorElementType() != MVT::f128);
14486
14487 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
14488 // clean. Some of them require two branches to implement.
14489 AArch64CC::CondCode CC1, CC2;
14490 bool ShouldInvert;
14491 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
14492
14493 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
14494 SDValue Cmp =
14495 EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
14496 if (!Cmp.getNode())
14497 return SDValue();
14498
14499 if (CC2 != AArch64CC::AL) {
14500 SDValue Cmp2 =
14501 EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
14502 if (!Cmp2.getNode())
14503 return SDValue();
14504
14505 Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
14506 }
14507
14508 Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
14509
14510 if (ShouldInvert)
14511 Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
14512
14513 return Cmp;
14514}
14515
14516static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
14517 SelectionDAG &DAG) {
14518 SDValue VecOp = ScalarOp.getOperand(0);
14519 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
14520 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
14521 DAG.getConstant(0, DL, MVT::i64));
14522}
14523
14524static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
14525 SDLoc DL, SelectionDAG &DAG) {
14526 unsigned ScalarOpcode;
14527 switch (Opcode) {
14528 case ISD::VECREDUCE_AND:
14529 ScalarOpcode = ISD::AND;
14530 break;
14531 case ISD::VECREDUCE_OR:
14532 ScalarOpcode = ISD::OR;
14533 break;
14534 case ISD::VECREDUCE_XOR:
14535 ScalarOpcode = ISD::XOR;
14536 break;
14537 default:
14538 llvm_unreachable("Expected bitwise vector reduction");
14539 return SDValue();
14540 }
14541
14542 EVT VecVT = Vec.getValueType();
14543 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
14544 "Expected power-of-2 length vector");
14545
14546 EVT ElemVT = VecVT.getVectorElementType();
14547
14548 SDValue Result;
14549 unsigned NumElems = VecVT.getVectorNumElements();
14550
14551 // Special case for boolean reductions
14552 if (ElemVT == MVT::i1) {
14553 // Split large vectors into smaller ones
14554 if (NumElems > 16) {
14555 SDValue Lo, Hi;
14556 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
14557 EVT HalfVT = Lo.getValueType();
14558 SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
14559 return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
14560 }
14561
14562 // Vectors that are less than 64 bits get widened to neatly fit a 64 bit
14563 // register, so e.g. <4 x i1> gets lowered to <4 x i16>. Sign extending to
14564 // this element size leads to the best codegen, since e.g. setcc results
14565 // might need to be truncated otherwise.
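    // For example (illustrative): a v8i1 AND reduction is sign-extended to
    // v8i8 (all-ones or zero per lane), reduced with VECREDUCE_UMIN, and the
    // low bit of that result is the final value.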
14566 EVT ExtendedVT = MVT::getIntegerVT(std::max(64u / NumElems, 8u));
14567
14568 // any_ext doesn't work with umin/umax, so only use it for uadd.
14569 unsigned ExtendOp =
14570 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
14571 SDValue Extended = DAG.getNode(
14572 ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
14573 switch (ScalarOpcode) {
14574 case ISD::AND:
14575 Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
14576 break;
14577 case ISD::OR:
14578 Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
14579 break;
14580 case ISD::XOR:
14581 Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
14582 break;
14583 default:
14584 llvm_unreachable("Unexpected Opcode");
14585 }
14586
14587 Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
14588 } else {
14589 // Iteratively split the vector in half and combine using the bitwise
14590 // operation until it fits in a 64 bit register.
14591 while (VecVT.getSizeInBits() > 64) {
14592 SDValue Lo, Hi;
14593 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
14594 VecVT = Lo.getValueType();
14595 NumElems = VecVT.getVectorNumElements();
14596 Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
14597 }
14598
14599 EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
14600
14601 // Do the remaining work on a scalar since it allows the code generator to
14602 // combine the shift and bitwise operation into one instruction and since
14603 // integer instructions can have higher throughput than vector instructions.
14604 SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
14605
14606 // Iteratively combine the lower and upper halves of the scalar using the
14607 // bitwise operation, halving the relevant region of the scalar in each
14608 // iteration, until the relevant region is just one element of the original
14609 // vector.
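    // For example (illustrative): a v8i8 XOR reduction is bitcast to i64 and
    // folded with shifts of 32, 16 and 8 bits, leaving the result in the low
    // byte.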
14610 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
14611 SDValue ShiftAmount =
14612 DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
14613 SDValue Shifted =
14614 DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
14615 Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
14616 }
14617
14618 Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
14619 }
14620
14621 return DAG.getAnyExtOrTrunc(Result, DL, VT);
14622}
14623
14624SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
14625 SelectionDAG &DAG) const {
14626 SDValue Src = Op.getOperand(0);
14627
14628 // Try to lower fixed length reductions to SVE.
14629 EVT SrcVT = Src.getValueType();
14630 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
14631 Op.getOpcode() == ISD::VECREDUCE_AND ||
14632 Op.getOpcode() == ISD::VECREDUCE_OR ||
14633 Op.getOpcode() == ISD::VECREDUCE_XOR ||
14634 Op.getOpcode() == ISD::VECREDUCE_FADD ||
14635 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
14636 SrcVT.getVectorElementType() == MVT::i64);
14637 if (SrcVT.isScalableVector() ||
14638      useSVEForFixedLengthVectorVT(
14639          SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
14640
14641 if (SrcVT.getVectorElementType() == MVT::i1)
14642 return LowerPredReductionToSVE(Op, DAG);
14643
14644 switch (Op.getOpcode()) {
14645 case ISD::VECREDUCE_ADD:
14646 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
14647 case ISD::VECREDUCE_AND:
14648 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
14649 case ISD::VECREDUCE_OR:
14650 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
14651    case ISD::VECREDUCE_SMAX:
14652      return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
14653    case ISD::VECREDUCE_SMIN:
14654      return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
14655    case ISD::VECREDUCE_UMAX:
14656      return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
14657    case ISD::VECREDUCE_UMIN:
14658      return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
14659    case ISD::VECREDUCE_XOR:
14660      return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
14661    case ISD::VECREDUCE_FADD:
14662      return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
14663    case ISD::VECREDUCE_FMAX:
14664      return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
14665    case ISD::VECREDUCE_FMIN:
14666      return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
14667    case ISD::VECREDUCE_FMAXIMUM:
14668      return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
14669    case ISD::VECREDUCE_FMINIMUM:
14670      return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
14671 default:
14672 llvm_unreachable("Unhandled fixed length reduction");
14673 }
14674 }
14675
14676 // Lower NEON reductions.
14677 SDLoc dl(Op);
14678 switch (Op.getOpcode()) {
14679 case ISD::VECREDUCE_AND:
14680 case ISD::VECREDUCE_OR:
14681 case ISD::VECREDUCE_XOR:
14682 return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
14683 Op.getValueType(), dl, DAG);
14684 case ISD::VECREDUCE_ADD:
14685 return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
14686  case ISD::VECREDUCE_SMAX:
14687    return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
14688  case ISD::VECREDUCE_SMIN:
14689    return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
14690  case ISD::VECREDUCE_UMAX:
14691    return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
14692  case ISD::VECREDUCE_UMIN:
14693    return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
14694 default:
14695 llvm_unreachable("Unhandled reduction");
14696 }
14697}
14698
14699SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
14700 SelectionDAG &DAG) const {
14701 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
14702 // No point replacing if we don't have the relevant instruction/libcall anyway
14703 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
14704 return SDValue();
14705
14706 // LSE has an atomic load-clear instruction, but not a load-and.
14707 SDLoc dl(Op);
14708 MVT VT = Op.getSimpleValueType();
14709 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
14710 SDValue RHS = Op.getOperand(2);
14711 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
14712 RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
14713 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
14714 Op.getOperand(0), Op.getOperand(1), RHS,
14715 AN->getMemOperand());
14716}
14717
14718SDValue
14719AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
14720 SelectionDAG &DAG) const {
14721
14722 SDLoc dl(Op);
14723 // Get the inputs.
14724 SDNode *Node = Op.getNode();
14725 SDValue Chain = Op.getOperand(0);
14726 SDValue Size = Op.getOperand(1);
14727  MaybeAlign Align =
14728      cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
14729 EVT VT = Node->getValueType(0);
14730
14731  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
14732          "no-stack-arg-probe")) {
14733 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14734 Chain = SP.getValue(1);
14735 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14736 if (Align)
14737 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14738 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
14739 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
14740 SDValue Ops[2] = {SP, Chain};
14741 return DAG.getMergeValues(Ops, dl);
14742 }
14743
14744 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
14745
14746 EVT PtrVT = getPointerTy(DAG.getDataLayout());
14747  SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
14748                                               PtrVT, 0);
14749
14750 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
14751 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
14752 if (Subtarget->hasCustomCallingConv())
14753 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
14754
14755 Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
14756 DAG.getConstant(4, dl, MVT::i64));
14757 Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
14758 Chain =
14759 DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
14760 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
14761 DAG.getRegisterMask(Mask), Chain.getValue(1));
14762 // To match the actual intent better, we should read the output from X15 here
14763 // again (instead of potentially spilling it to the stack), but rereading Size
14764 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
14765 // here.
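  // The SRL/SHL by 4 around the call reflect the Windows AArch64 stack-probe
  // convention, where X15 carries the allocation size in 16-byte units.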
14766
14767 Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
14768 DAG.getConstant(4, dl, MVT::i64));
14769
14770 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14771 Chain = SP.getValue(1);
14772 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14773 if (Align)
14774 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14775 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
14776 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
14777
14778 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
14779
14780 SDValue Ops[2] = {SP, Chain};
14781 return DAG.getMergeValues(Ops, dl);
14782}
14783
14784SDValue
14785AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
14786 SelectionDAG &DAG) const {
14787 // Get the inputs.
14788 SDNode *Node = Op.getNode();
14789 SDValue Chain = Op.getOperand(0);
14790 SDValue Size = Op.getOperand(1);
14791
14792  MaybeAlign Align =
14793      cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
14794 SDLoc dl(Op);
14795 EVT VT = Node->getValueType(0);
14796
14797 // Construct the new SP value in a GPR.
14798 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14799 Chain = SP.getValue(1);
14800 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14801 if (Align)
14802 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14803 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
14804
14805 // Set the real SP to the new value with a probing loop.
14806 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
14807 SDValue Ops[2] = {SP, Chain};
14808 return DAG.getMergeValues(Ops, dl);
14809}
14810
14811SDValue
14812AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
14813 SelectionDAG &DAG) const {
14814  MachineFunction &MF = DAG.getMachineFunction();
14815
14816 if (Subtarget->isTargetWindows())
14817 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
14818 else if (hasInlineStackProbe(MF))
14819 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
14820 else
14821 return SDValue();
14822}
14823
14824// When x and y are extended, lower:
14825// avgfloor(x, y) -> (x + y) >> 1
14826// avgceil(x, y) -> (x + y + 1) >> 1
14827
14828// Otherwise, lower to:
14829// avgfloor(x, y) -> (x >> 1) + (y >> 1) + (x & y & 1)
14830// avgceil(x, y)  -> (x >> 1) + (y >> 1) + ((x | y) & 1)
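// For example (illustrative, i8 lanes): avgfloor(5, 6)
//   = (5 >> 1) + (6 >> 1) + (5 & 6 & 1) = 2 + 3 + 0 = 5, and
// avgceil(5, 6) = (5 >> 1) + (6 >> 1) + ((5 | 6) & 1) = 2 + 3 + 1 = 6.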
14831SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
14832 unsigned NewOp) const {
14833 if (Subtarget->hasSVE2())
14834 return LowerToPredicatedOp(Op, DAG, NewOp);
14835
14836 SDLoc dl(Op);
14837 SDValue OpA = Op->getOperand(0);
14838 SDValue OpB = Op->getOperand(1);
14839 EVT VT = Op.getValueType();
14840 bool IsCeil =
14841 (Op->getOpcode() == ISD::AVGCEILS || Op->getOpcode() == ISD::AVGCEILU);
14842 bool IsSigned =
14843 (Op->getOpcode() == ISD::AVGFLOORS || Op->getOpcode() == ISD::AVGCEILS);
14844 unsigned ShiftOpc = IsSigned ? ISD::SRA : ISD::SRL;
14845
14846 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
14847
14848 auto IsZeroExtended = [&DAG](SDValue &Node) {
14849 KnownBits Known = DAG.computeKnownBits(Node, 0);
14850 return Known.Zero.isSignBitSet();
14851 };
14852
14853 auto IsSignExtended = [&DAG](SDValue &Node) {
14854 return (DAG.ComputeNumSignBits(Node, 0) > 1);
14855 };
14856
14857 SDValue ConstantOne = DAG.getConstant(1, dl, VT);
14858 if ((!IsSigned && IsZeroExtended(OpA) && IsZeroExtended(OpB)) ||
14859 (IsSigned && IsSignExtended(OpA) && IsSignExtended(OpB))) {
14860 SDValue Add = DAG.getNode(ISD::ADD, dl, VT, OpA, OpB);
14861 if (IsCeil)
14862 Add = DAG.getNode(ISD::ADD, dl, VT, Add, ConstantOne);
14863 return DAG.getNode(ShiftOpc, dl, VT, Add, ConstantOne);
14864 }
14865
14866 SDValue ShiftOpA = DAG.getNode(ShiftOpc, dl, VT, OpA, ConstantOne);
14867 SDValue ShiftOpB = DAG.getNode(ShiftOpc, dl, VT, OpB, ConstantOne);
14868
14869 SDValue tmp = DAG.getNode(IsCeil ? ISD::OR : ISD::AND, dl, VT, OpA, OpB);
14870 tmp = DAG.getNode(ISD::AND, dl, VT, tmp, ConstantOne);
14871 SDValue Add = DAG.getNode(ISD::ADD, dl, VT, ShiftOpA, ShiftOpB);
14872 return DAG.getNode(ISD::ADD, dl, VT, Add, tmp);
14873}
14874
14875SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
14876 SelectionDAG &DAG) const {
14877 EVT VT = Op.getValueType();
14878 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
14879
14880 SDLoc DL(Op);
14881 APInt MulImm = Op.getConstantOperandAPInt(0);
14882 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
14883 VT);
14884}
14885
14886/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
14887template <unsigned NumVecs>
14888static bool
14892 // Retrieve EC from first vector argument.
14893 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
14895#ifndef NDEBUG
14896 // Check the assumption that all input vectors are the same type.
14897 for (unsigned I = 0; I < NumVecs; ++I)
14898 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
14899 "Invalid type.");
14900#endif
14901 // memVT is `NumVecs * VT`.
14903 EC * NumVecs);
14904 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
14905 Info.offset = 0;
14906 Info.align.reset();
14908 return true;
14909}
14910
14911/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
14912/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
14913/// specified in the intrinsic calls.
14914bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
14915                                               const CallInst &I,
14916 MachineFunction &MF,
14917 unsigned Intrinsic) const {
14918 auto &DL = I.getModule()->getDataLayout();
14919 switch (Intrinsic) {
14920 case Intrinsic::aarch64_sve_st2:
14921 return setInfoSVEStN<2>(*this, DL, Info, I);
14922 case Intrinsic::aarch64_sve_st3:
14923 return setInfoSVEStN<3>(*this, DL, Info, I);
14924 case Intrinsic::aarch64_sve_st4:
14925 return setInfoSVEStN<4>(*this, DL, Info, I);
14926 case Intrinsic::aarch64_neon_ld2:
14927 case Intrinsic::aarch64_neon_ld3:
14928 case Intrinsic::aarch64_neon_ld4:
14929 case Intrinsic::aarch64_neon_ld1x2:
14930 case Intrinsic::aarch64_neon_ld1x3:
14931 case Intrinsic::aarch64_neon_ld1x4: {
14933 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
14934 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
14935 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14936 Info.offset = 0;
14937 Info.align.reset();
14938 // volatile loads with NEON intrinsics not supported
14940 return true;
14941 }
14942 case Intrinsic::aarch64_neon_ld2lane:
14943 case Intrinsic::aarch64_neon_ld3lane:
14944 case Intrinsic::aarch64_neon_ld4lane:
14945 case Intrinsic::aarch64_neon_ld2r:
14946 case Intrinsic::aarch64_neon_ld3r:
14947 case Intrinsic::aarch64_neon_ld4r: {
14949    // These intrinsics return a struct of vectors, all with the same vector type.
14950 Type *RetTy = I.getType();
14951 auto *StructTy = cast<StructType>(RetTy);
14952 unsigned NumElts = StructTy->getNumElements();
14953 Type *VecTy = StructTy->getElementType(0);
14954 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
14955 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
14956 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14957 Info.offset = 0;
14958 Info.align.reset();
14959 // volatile loads with NEON intrinsics not supported
14961 return true;
14962 }
14963 case Intrinsic::aarch64_neon_st2:
14964 case Intrinsic::aarch64_neon_st3:
14965 case Intrinsic::aarch64_neon_st4:
14966 case Intrinsic::aarch64_neon_st1x2:
14967 case Intrinsic::aarch64_neon_st1x3:
14968 case Intrinsic::aarch64_neon_st1x4: {
14970 unsigned NumElts = 0;
14971 for (const Value *Arg : I.args()) {
14972 Type *ArgTy = Arg->getType();
14973 if (!ArgTy->isVectorTy())
14974 break;
14975 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
14976 }
14977 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
14978 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14979 Info.offset = 0;
14980 Info.align.reset();
14981 // volatile stores with NEON intrinsics not supported
14983 return true;
14984 }
14985 case Intrinsic::aarch64_neon_st2lane:
14986 case Intrinsic::aarch64_neon_st3lane:
14987 case Intrinsic::aarch64_neon_st4lane: {
14989 unsigned NumElts = 0;
14990    // All of the vector arguments have the same type.
14991 Type *VecTy = I.getArgOperand(0)->getType();
14992 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
14993
14994 for (const Value *Arg : I.args()) {
14995 Type *ArgTy = Arg->getType();
14996 if (!ArgTy->isVectorTy())
14997 break;
14998 NumElts += 1;
14999 }
15000
15001 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
15002 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
15003 Info.offset = 0;
15004 Info.align.reset();
15005 // volatile stores with NEON intrinsics not supported
15007 return true;
15008 }
15009 case Intrinsic::aarch64_ldaxr:
15010 case Intrinsic::aarch64_ldxr: {
15011 Type *ValTy = I.getParamElementType(0);
15013 Info.memVT = MVT::getVT(ValTy);
15014 Info.ptrVal = I.getArgOperand(0);
15015 Info.offset = 0;
15016 Info.align = DL.getABITypeAlign(ValTy);
15018 return true;
15019 }
15020 case Intrinsic::aarch64_stlxr:
15021 case Intrinsic::aarch64_stxr: {
15022 Type *ValTy = I.getParamElementType(1);
15024 Info.memVT = MVT::getVT(ValTy);
15025 Info.ptrVal = I.getArgOperand(1);
15026 Info.offset = 0;
15027 Info.align = DL.getABITypeAlign(ValTy);
15029 return true;
15030 }
15031 case Intrinsic::aarch64_ldaxp:
15032 case Intrinsic::aarch64_ldxp:
15034 Info.memVT = MVT::i128;
15035 Info.ptrVal = I.getArgOperand(0);
15036 Info.offset = 0;
15037 Info.align = Align(16);
15039 return true;
15040 case Intrinsic::aarch64_stlxp:
15041 case Intrinsic::aarch64_stxp:
15043 Info.memVT = MVT::i128;
15044 Info.ptrVal = I.getArgOperand(2);
15045 Info.offset = 0;
15046 Info.align = Align(16);
15048 return true;
15049 case Intrinsic::aarch64_sve_ldnt1: {
15050 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
15052 Info.memVT = MVT::getVT(I.getType());
15053 Info.ptrVal = I.getArgOperand(1);
15054 Info.offset = 0;
15055 Info.align = DL.getABITypeAlign(ElTy);
15057 return true;
15058 }
15059 case Intrinsic::aarch64_sve_stnt1: {
15060 Type *ElTy =
15061 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
15063 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
15064 Info.ptrVal = I.getArgOperand(2);
15065 Info.offset = 0;
15066 Info.align = DL.getABITypeAlign(ElTy);
15068 return true;
15069 }
15070 case Intrinsic::aarch64_mops_memset_tag: {
15071 Value *Dst = I.getArgOperand(0);
15072 Value *Val = I.getArgOperand(1);
15074 Info.memVT = MVT::getVT(Val->getType());
15075 Info.ptrVal = Dst;
15076 Info.offset = 0;
15077 Info.align = I.getParamAlign(0).valueOrOne();
15079 // The size of the memory being operated on is unknown at this point
15081 return true;
15082 }
15083 default:
15084 break;
15085 }
15086
15087 return false;
15088}
15089
15090bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
15091                                                  ISD::LoadExtType ExtTy,
15092 EVT NewVT) const {
15093 // TODO: This may be worth removing. Check regression tests for diffs.
15094 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
15095 return false;
15096
15097 // If we're reducing the load width in order to avoid having to use an extra
15098 // instruction to do extension then it's probably a good idea.
15099 if (ExtTy != ISD::NON_EXTLOAD)
15100 return true;
15101 // Don't reduce load width if it would prevent us from combining a shift into
15102 // the offset.
15103 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
15104 assert(Mem);
15105 const SDValue &Base = Mem->getBasePtr();
15106 if (Base.getOpcode() == ISD::ADD &&
15107 Base.getOperand(1).getOpcode() == ISD::SHL &&
15108 Base.getOperand(1).hasOneUse() &&
15109 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
15110 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
15111 if (Mem->getMemoryVT().isScalableVector())
15112 return false;
15113 // The shift can be combined if it matches the size of the value being
15114 // loaded (and so reducing the width would make it not match).
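    // For example (illustrative): for (load i32 (add x, (shl y, 2))) the
    // shift folds into the addressing mode, e.g. "ldr w0, [x0, x1, lsl #2]",
    // and narrowing the load would lose that fold.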
15115 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
15116 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
15117 if (ShiftAmount == Log2_32(LoadBytes))
15118 return false;
15119 }
15120 // We have no reason to disallow reducing the load width, so allow it.
15121 return true;
15122}
15123
15124// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
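// On AArch64 the extension can typically be folded into the lane move itself,
// e.g. smov w0, v0.h[0] sign-extends an i16 lane straight to 32 bits.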
15126 EVT VT = Extend.getValueType();
15127 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
15128 SDValue Extract = Extend.getOperand(0);
15129 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
15130 Extract = Extract.getOperand(0);
15131 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
15132 EVT VecVT = Extract.getOperand(0).getValueType();
15133 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
15134 return false;
15135 }
15136 }
15137 return true;
15138}
15139
15140// Truncations from 64-bit GPR to 32-bit GPR are free.
15142 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
15143 return false;
15144 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
15145 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
15146 return NumBits1 > NumBits2;
15147}
15149 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
15150 return false;
15151 uint64_t NumBits1 = VT1.getFixedSizeInBits();
15152 uint64_t NumBits2 = VT2.getFixedSizeInBits();
15153 return NumBits1 > NumBits2;
15154}
15155
15156/// Check if it is profitable to hoist an instruction in then/else to if.
15157/// It is not profitable if I and its user can form an FMA instruction
15158/// because we prefer FMSUB/FMADD.
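/// For example, (fsub z, (fmul x, y)) can be selected as a single fmsub;
/// hoisting the fmul away from the fsub would lose that fusion.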
15160 if (I->getOpcode() != Instruction::FMul)
15161 return true;
15162
15163 if (!I->hasOneUse())
15164 return true;
15165
15166 Instruction *User = I->user_back();
15167
15168 if (!(User->getOpcode() == Instruction::FSub ||
15169 User->getOpcode() == Instruction::FAdd))
15170 return true;
15171
15173 const Function *F = I->getFunction();
15174 const DataLayout &DL = F->getParent()->getDataLayout();
15175 Type *Ty = User->getOperand(0)->getType();
15176
15177 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
15179 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
15180 Options.UnsafeFPMath));
15181}
15182
15183// All 32-bit GPR operations implicitly zero the high-half of the corresponding
15184// 64-bit GPR.
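// For example, add w0, w1, w2 leaves bits [63:32] of x0 cleared, so a
// subsequent zext from i32 to i64 needs no extra instruction.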
15186 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
15187 return false;
15188 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
15189 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
15190 return NumBits1 == 32 && NumBits2 == 64;
15191}
15193 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
15194 return false;
15195 unsigned NumBits1 = VT1.getSizeInBits();
15196 unsigned NumBits2 = VT2.getSizeInBits();
15197 return NumBits1 == 32 && NumBits2 == 64;
15198}
15199
15201 EVT VT1 = Val.getValueType();
15202 if (isZExtFree(VT1, VT2)) {
15203 return true;
15204 }
15205
15206 if (Val.getOpcode() != ISD::LOAD)
15207 return false;
15208
15209 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
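 // e.g. ldrb/ldrh/ldr into a W register writes zeros into the upper bits,
 // so the zext is folded into the load itself.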
15210 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
15211 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
15212 VT1.getSizeInBits() <= 32);
15213}
15214
15215bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
15216 if (isa<FPExtInst>(Ext))
15217 return false;
15218
15219 // Vector types are not free.
15220 if (Ext->getType()->isVectorTy())
15221 return false;
15222
15223 for (const Use &U : Ext->uses()) {
15224 // The extension is free if we can fold it with a left shift in an
15225 // addressing mode or an arithmetic operation: add, sub, and cmp.
15226
15227 // Is there a shift?
15228 const Instruction *Instr = cast<Instruction>(U.getUser());
15229
15230 // Is this a constant shift?
15231 switch (Instr->getOpcode()) {
15232 case Instruction::Shl:
15233 if (!isa<ConstantInt>(Instr->getOperand(1)))
15234 return false;
15235 break;
15236 case Instruction::GetElementPtr: {
15237 gep_type_iterator GTI = gep_type_begin(Instr);
15238 auto &DL = Ext->getModule()->getDataLayout();
15239 std::advance(GTI, U.getOperandNo()-1);
15240 Type *IdxTy = GTI.getIndexedType();
15241 // This extension will end up with a shift because of the scaling factor.
15242 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
15243 // Get the shift amount based on the scaling factor:
15244 // log2(sizeof(IdxTy)) - log2(8).
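 // For example, a GEP over i32 elements scales the index by 4 (a shift of
 // 2), which fits the ldr [Xn, Wm, sxtw #2] addressing form.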
15245 if (IdxTy->isScalableTy())
15246 return false;
15247 uint64_t ShiftAmt =
15248 llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
15249 3;
15250 // Is the constant foldable in the shift of the addressing mode?
15251 // I.e., shift amount is between 1 and 4 inclusive.
15252 if (ShiftAmt == 0 || ShiftAmt > 4)
15253 return false;
15254 break;
15255 }
15256 case Instruction::Trunc:
15257 // Check if this is a noop.
15258 // trunc(sext ty1 to ty2) to ty1.
15259 if (Instr->getType() == Ext->getOperand(0)->getType())
15260 continue;
15261 [[fallthrough]];
15262 default:
15263 return false;
15264 }
15265
15266 // At this point we can use the bfm family, so this extension is free
15267 // for that use.
15268 }
15269 return true;
15270}
15271
15272static bool isSplatShuffle(Value *V) {
15273 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
15274 return all_equal(Shuf->getShuffleMask());
15275 return false;
15276}
15277
15278/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
15279/// or upper half of the vector elements.
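/// For example, shufflevector <8 x i16> %v, <8 x i16> poison,
///                            <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// extracts the upper half of %v.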
15280static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
15281 bool AllowSplat = false) {
15282 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
15283 auto *FullTy = FullV->getType();
15284 auto *HalfTy = HalfV->getType();
15285 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
15286 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
15287 };
15288
15289 auto extractHalf = [](Value *FullV, Value *HalfV) {
15290 auto *FullVT = cast<FixedVectorType>(FullV->getType());
15291 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
15292 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
15293 };
15294
15295 ArrayRef<int> M1, M2;
15296 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
15297 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
15298 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
15299 return false;
15300
15301 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
15302 // it is not checked as an extract below.
15303 if (AllowSplat && isSplatShuffle(Op1))
15304 S1Op1 = nullptr;
15305 if (AllowSplat && isSplatShuffle(Op2))
15306 S2Op1 = nullptr;
15307
15308 // Check that the operands are half as wide as the result and we extract
15309 // half of the elements of the input vectors.
15310 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
15311 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
15312 return false;
15313
15314 // Check that the mask extracts either the lower or upper half of the vector
15315 // elements.
15316 int M1Start = 0;
15317 int M2Start = 0;
15318 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
15319 if ((S1Op1 &&
15320 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
15321 (S2Op1 &&
15322 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
15323 return false;
15324
15325 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
15326 (M2Start != 0 && M2Start != (NumElements / 2)))
15327 return false;
15328 if (S1Op1 && S2Op1 && M1Start != M2Start)
15329 return false;
15330
15331 return true;
15332}
15333
15334/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
15335/// of the vector elements.
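/// For example, a sext from <4 x i16> to <4 x i32> qualifies (32 == 2 * 16),
/// whereas an extend all the way to <4 x i64> does not.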
15336static bool areExtractExts(Value *Ext1, Value *Ext2) {
15337 auto areExtDoubled = [](Instruction *Ext) {
15338 return Ext->getType()->getScalarSizeInBits() ==
15339 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
15340 };
15341
15342 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
15343 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
15344 !areExtDoubled(cast<Instruction>(Ext1)) ||
15345 !areExtDoubled(cast<Instruction>(Ext2)))
15346 return false;
15347
15348 return true;
15349}
15350
15351/// Check if Op could be used with vmull_high_p64 intrinsic.
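/// i.e. Op must be (extractelement <2 x i64> %v, i64 1), the high half that
/// pmull2 reads directly from the vector register.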
15353 Value *VectorOperand = nullptr;
15354 ConstantInt *ElementIndex = nullptr;
15355 return match(Op, m_ExtractElt(m_Value(VectorOperand),
15356 m_ConstantInt(ElementIndex))) &&
15357 ElementIndex->getValue() == 1 &&
15358 isa<FixedVectorType>(VectorOperand->getType()) &&
15359 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
15360}
15361
15362/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
15363static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
15365}
15366
15368 // Restrict ourselves to the form CodeGenPrepare typically constructs.
15369 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
15370 if (!GEP || GEP->getNumOperands() != 2)
15371 return false;
15372
15373 Value *Base = GEP->getOperand(0);
15374 Value *Offsets = GEP->getOperand(1);
15375
15376 // We only care about scalar_base+vector_offsets.
15377 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
15378 return false;
15379
15380 // Sink extends that would allow us to use 32-bit offset vectors.
15381 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
15382 auto *OffsetsInst = cast<Instruction>(Offsets);
15383 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
15384 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
15385 Ops.push_back(&GEP->getOperandUse(1));
15386 }
15387
15388 // Sink the GEP.
15389 return true;
15390}
15391
15392/// We want to sink the following cases:
15393/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale
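/// Sinking lets instruction selection fold the scaled vscale into SVE
/// addressing modes and vscale-scaled immediates (e.g. [x0, #1, mul vl]).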
15395 if (match(Op, m_VScale()))
15396 return true;
15397 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
15399 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
15400 return true;
15401 }
15402 return false;
15403}
15404
15405/// Check if sinking \p I's operands to I's basic block is profitable, because
15406/// the operands can be folded into a target instruction, e.g.
15407/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
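/// For example, sub (zext <8 x i8> %a to <8 x i16>), (zext <8 x i8> %b to <8 x i16>)
/// can then be selected as a single usubl instruction.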
15409 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
15410 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
15411 switch (II->getIntrinsicID()) {
15412 case Intrinsic::aarch64_neon_smull:
15413 case Intrinsic::aarch64_neon_umull:
15414 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
15415 /*AllowSplat=*/true)) {
15416 Ops.push_back(&II->getOperandUse(0));
15417 Ops.push_back(&II->getOperandUse(1));
15418 return true;
15419 }
15420 [[fallthrough]];
15421
15422 case Intrinsic::fma:
15423 if (isa<VectorType>(I->getType()) &&
15424 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
15425 !Subtarget->hasFullFP16())
15426 return false;
15427 [[fallthrough]];
15428 case Intrinsic::aarch64_neon_sqdmull:
15429 case Intrinsic::aarch64_neon_sqdmulh:
15430 case Intrinsic::aarch64_neon_sqrdmulh:
15431 // Sink splats for index lane variants
15432 if (isSplatShuffle(II->getOperand(0)))
15433 Ops.push_back(&II->getOperandUse(0));
15434 if (isSplatShuffle(II->getOperand(1)))
15435 Ops.push_back(&II->getOperandUse(1));
15436 return !Ops.empty();
15437 case Intrinsic::aarch64_neon_fmlal:
15438 case Intrinsic::aarch64_neon_fmlal2:
15439 case Intrinsic::aarch64_neon_fmlsl:
15440 case Intrinsic::aarch64_neon_fmlsl2:
15441 // Sink splats for index lane variants
15442 if (isSplatShuffle(II->getOperand(1)))
15443 Ops.push_back(&II->getOperandUse(1));
15444 if (isSplatShuffle(II->getOperand(2)))
15445 Ops.push_back(&II->getOperandUse(2));
15446 return !Ops.empty();
15447 case Intrinsic::aarch64_sve_ptest_first:
15448 case Intrinsic::aarch64_sve_ptest_last:
15449 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
15450 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
15451 Ops.push_back(&II->getOperandUse(0));
15452 return !Ops.empty();
15453 case Intrinsic::aarch64_sme_write_horiz:
15454 case Intrinsic::aarch64_sme_write_vert:
15455 case Intrinsic::aarch64_sme_writeq_horiz:
15456 case Intrinsic::aarch64_sme_writeq_vert: {
15457 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
15458 if (!Idx || Idx->getOpcode() != Instruction::Add)
15459 return false;
15460 Ops.push_back(&II->getOperandUse(1));
15461 return true;
15462 }
15463 case Intrinsic::aarch64_sme_read_horiz:
15464 case Intrinsic::aarch64_sme_read_vert:
15465 case Intrinsic::aarch64_sme_readq_horiz:
15466 case Intrinsic::aarch64_sme_readq_vert:
15467 case Intrinsic::aarch64_sme_ld1b_vert:
15468 case Intrinsic::aarch64_sme_ld1h_vert:
15469 case Intrinsic::aarch64_sme_ld1w_vert:
15470 case Intrinsic::aarch64_sme_ld1d_vert:
15471 case Intrinsic::aarch64_sme_ld1q_vert:
15472 case Intrinsic::aarch64_sme_st1b_vert:
15473 case Intrinsic::aarch64_sme_st1h_vert:
15474 case Intrinsic::aarch64_sme_st1w_vert:
15475 case Intrinsic::aarch64_sme_st1d_vert:
15476 case Intrinsic::aarch64_sme_st1q_vert:
15477 case Intrinsic::aarch64_sme_ld1b_horiz:
15478 case Intrinsic::aarch64_sme_ld1h_horiz:
15479 case Intrinsic::aarch64_sme_ld1w_horiz:
15480 case Intrinsic::aarch64_sme_ld1d_horiz:
15481 case Intrinsic::aarch64_sme_ld1q_horiz:
15482 case Intrinsic::aarch64_sme_st1b_horiz:
15483 case Intrinsic::aarch64_sme_st1h_horiz:
15484 case Intrinsic::aarch64_sme_st1w_horiz:
15485 case Intrinsic::aarch64_sme_st1d_horiz:
15486 case Intrinsic::aarch64_sme_st1q_horiz: {
15487 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
15488 if (!Idx || Idx->getOpcode() != Instruction::Add)
15489 return false;
15490 Ops.push_back(&II->getOperandUse(3));
15491 return true;
15492 }
15493 case Intrinsic::aarch64_neon_pmull:
15494 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
15495 return false;
15496 Ops.push_back(&II->getOperandUse(0));
15497 Ops.push_back(&II->getOperandUse(1));
15498 return true;
15499 case Intrinsic::aarch64_neon_pmull64:
15500 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
15501 II->getArgOperand(1)))
15502 return false;
15503 Ops.push_back(&II->getArgOperandUse(0));
15504 Ops.push_back(&II->getArgOperandUse(1));
15505 return true;
15506 case Intrinsic::masked_gather:
15507 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
15508 return false;
15509 Ops.push_back(&II->getArgOperandUse(0));
15510 return true;
15511 case Intrinsic::masked_scatter:
15512 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
15513 return false;
15514 Ops.push_back(&II->getArgOperandUse(1));
15515 return true;
15516 default:
15517 return false;
15518 }
15519 }
15520
15521 // Sink vscales closer to uses for better isel
15522 switch (I->getOpcode()) {
15523 case Instruction::GetElementPtr:
15524 case Instruction::Add:
15525 case Instruction::Sub:
15526 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
15527 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
15528 Ops.push_back(&I->getOperandUse(Op));
15529 return true;
15530 }
15531 }
15532 break;
15533 default:
15534 break;
15535 }
15536
15537 if (!I->getType()->isVectorTy())
15538 return false;
15539
15540 switch (I->getOpcode()) {
15541 case Instruction::Sub:
15542 case Instruction::Add: {
15543 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
15544 return false;
15545
15546 // If the exts' operands extract either the lower or upper elements, we
15547 // can sink them too.
15548 auto Ext1 = cast<Instruction>(I->getOperand(0));
15549 auto Ext2 = cast<Instruction>(I->getOperand(1));
15550 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
15551 Ops.push_back(&Ext1->getOperandUse(0));
15552 Ops.push_back(&Ext2->getOperandUse(0));
15553 }
15554
15555 Ops.push_back(&I->getOperandUse(0));
15556 Ops.push_back(&I->getOperandUse(1));
15557
15558 return true;
15559 }
15560 case Instruction::Or: {
15561 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
15562 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
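 // This corresponds to the NEON bsl/bit/bif family once all the operands
 // live in the same block as the Or.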
15563 if (Subtarget->hasNEON()) {
15564 Instruction *OtherAnd, *IA, *IB;
15565 Value *MaskValue;
15566 // MainAnd refers to the And instruction that has 'Not' as one of its operands.
15567 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
15568 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
15569 m_Instruction(IA)))))) {
15570 if (match(OtherAnd,
15571 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
15572 Instruction *MainAnd = I->getOperand(0) == OtherAnd
15573 ? cast<Instruction>(I->getOperand(1))
15574 : cast<Instruction>(I->getOperand(0));
15575
15576 // Both Ands should be in the same basic block as the Or.
15577 if (I->getParent() != MainAnd->getParent() ||
15578 I->getParent() != OtherAnd->getParent())
15579 return false;
15580
15581 // Non-mask operands of both Ands should also be in the same basic block.
15582 if (I->getParent() != IA->getParent() ||
15583 I->getParent() != IB->getParent())
15584 return false;
15585
15586 Ops.push_back(&MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
15587 Ops.push_back(&I->getOperandUse(0));
15588 Ops.push_back(&I->getOperandUse(1));
15589
15590 return true;
15591 }
15592 }
15593 }
15594
15595 return false;
15596 }
15597 case Instruction::Mul: {
15598 int NumZExts = 0, NumSExts = 0;
15599 for (auto &Op : I->operands()) {
15600 // Make sure we are not already sinking this operand
15601 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
15602 continue;
15603
15604 if (match(&Op, m_SExt(m_Value()))) {
15605 NumSExts++;
15606 continue;
15607 } else if (match(&Op, m_ZExt(m_Value()))) {
15608 NumZExts++;
15609 continue;
15610 }
15611
15612 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
15613
15614 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
15615 // operand and the s/zext can help create indexed s/umull. This is
15616 // especially useful to prevent an i64 mul from being scalarized.
15617 if (Shuffle && isSplatShuffle(Shuffle) &&
15618 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
15619 Ops.push_back(&Shuffle->getOperandUse(0));
15620 Ops.push_back(&Op);
15621 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
15622 NumSExts++;
15623 else
15624 NumZExts++;
15625 continue;
15626 }
15627
15628 if (!Shuffle)
15629 continue;
15630
15631 Value *ShuffleOperand = Shuffle->getOperand(0);
15632 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
15633 if (!Insert)
15634 continue;
15635
15636 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
15637 if (!OperandInstr)
15638 continue;
15639
15640 ConstantInt *ElementConstant =
15641 dyn_cast<ConstantInt>(Insert->getOperand(2));
15642 // Check that the insertelement is inserting into element 0
15643 if (!ElementConstant || !ElementConstant->isZero())
15644 continue;
15645
15646 unsigned Opcode = OperandInstr->getOpcode();
15647 if (Opcode == Instruction::SExt)
15648 NumSExts++;
15649 else if (Opcode == Instruction::ZExt)
15650 NumZExts++;
15651 else {
15652 // If we find that the top bits are known 0, then we can sink and allow
15653 // the backend to generate a umull.
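 // e.g. an i32 operand produced by (and %x, 65535) has its top 16 bits
 // known zero, so it behaves like a zext for the purposes of umull.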
15654 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
15655 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
15656 const DataLayout &DL = I->getFunction()->getParent()->getDataLayout();
15657 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
15658 continue;
15659 NumZExts++;
15660 }
15661
15662 Ops.push_back(&Shuffle->getOperandUse(0));
15663 Ops.push_back(&Op);
15664 }
15665
15666 // It is profitable to sink if we found two of the same type of extends.
15667 return !Ops.empty() && (NumSExts == 2 || NumZExts == 2);
15668 }
15669 default:
15670 return false;
15671 }
15672 return false;
15673}
15674
15676 bool IsLittleEndian) {
15677 Value *Op = ZExt->getOperand(0);
15678 auto *SrcTy = cast<FixedVectorType>(Op->getType());
15679 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
15680 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
15681 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth >= 64)
15682 return false;
15683
15684 assert(DstWidth % SrcWidth == 0 &&
15685 "TBL lowering is not supported for a ZExt instruction with this "
15686 "source & destination element type.");
15687 unsigned ZExtFactor = DstWidth / SrcWidth;
15688 unsigned NumElts = SrcTy->getNumElements();
15689 IRBuilder<> Builder(ZExt);
15690 SmallVector<int> Mask;
15691 // Create a mask that selects <0,...,Op[i]> for each lane of the destination
15692 // vector to replace the original ZExt. This can later be lowered to a set of
15693 // tbl instructions.
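 // For example, zext <8 x i8> to <8 x i32> (ZExtFactor 4) on little-endian
 // uses the mask <0,8,8,8, 1,8,8,8, ...>, where index 8 selects the zero
 // element inserted below.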
15694 for (unsigned i = 0; i < NumElts * ZExtFactor; i++) {
15695 if (IsLittleEndian) {
15696 if (i % ZExtFactor == 0)
15697 Mask.push_back(i / ZExtFactor);
15698 else
15699 Mask.push_back(NumElts);
15700 } else {
15701 if ((i + 1) % ZExtFactor == 0)
15702 Mask.push_back((i - ZExtFactor + 1) / ZExtFactor);
15703 else
15704 Mask.push_back(NumElts);
15705 }
15706 }
15707
15708 auto *FirstEltZero = Builder.CreateInsertElement(
15709 PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
15710 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
15711 Result = Builder.CreateBitCast(Result, DstTy);
15712 if (DstTy != ZExt->getType())
15713 Result = Builder.CreateZExt(Result, ZExt->getType());
15714 ZExt->replaceAllUsesWith(Result);
15715 ZExt->eraseFromParent();
15716 return true;
15717}
15718
15719static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
15720 IRBuilder<> Builder(TI);
15722 int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
15723 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
15724 auto *DstTy = cast<FixedVectorType>(TI->getType());
15725 assert(SrcTy->getElementType()->isIntegerTy() &&
15726 "Non-integer type source vector element is not supported");
15727 assert(DstTy->getElementType()->isIntegerTy(8) &&
15728 "Unsupported destination vector element type");
15729 unsigned SrcElemTySz =
15730 cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
15731 unsigned DstElemTySz =
15732 cast<IntegerType>(DstTy->getElementType())->getBitWidth();
15733 assert((SrcElemTySz % DstElemTySz == 0) &&
15734 "Cannot lower truncate to tbl instructions for a source element size "
15735 "that is not divisible by the destination element size");
15736 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
15737 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
15738 "Unsupported source vector element type size");
15739 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
15740
15741 // Create a mask to choose every nth byte from the source vector table of
15742 // bytes to create the truncated destination vector, where 'n' is the truncate
15743 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose
15744 // bytes 0, 8, 16, ..., (Y-1)*8 for the little-endian format.
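 // e.g. an <8 x i64> to <8 x i8> truncate (TruncFactor 8) selects bytes
 // 0, 8, ..., 56 on little-endian and bytes 7, 15, ..., 63 on big-endian.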
15746 for (int Itr = 0; Itr < 16; Itr++) {
15747 if (Itr < NumElements)
15748 MaskConst.push_back(Builder.getInt8(
15749 IsLittleEndian ? Itr * TruncFactor
15750 : Itr * TruncFactor + (TruncFactor - 1)));
15751 else
15752 MaskConst.push_back(Builder.getInt8(255));
15753 }
15754
15755 int MaxTblSz = 128 * 4;
15756 int MaxSrcSz = SrcElemTySz * NumElements;
15757 int ElemsPerTbl =
15758 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
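 // For example, a <16 x i32> source spans 512 bits, so a single tbl result
 // can cover all 16 elements; a <16 x i64> source only fits 8 elements per tbl.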
15759 assert(ElemsPerTbl <= 16 &&
15760 "Maximum elements selected using TBL instruction cannot exceed 16!");
15761
15762 int ShuffleCount = 128 / SrcElemTySz;
15763 SmallVector<int> ShuffleLanes;
15764 for (int i = 0; i < ShuffleCount; ++i)
15765 ShuffleLanes.push_back(i);
15766
15767 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
15768 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
15769 // call TBL & save the result in a vector of TBL results for combining later.
15771 while (ShuffleLanes.back() < NumElements) {
15772 Parts.push_back(Builder.CreateBitCast(
15773 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
15774
15775 if (Parts.size() == 4) {
15777 Intrinsic::aarch64_neon_tbl4, VecTy);
15778 Parts.push_back(ConstantVector::get(MaskConst));
15779 Results.push_back(Builder.CreateCall(F, Parts));
15780 Parts.clear();
15781 }
15782
15783 for (int i = 0; i < ShuffleCount; ++i)
15784 ShuffleLanes[i] += ShuffleCount;
15785 }
15786
15787 assert((Parts.empty() || Results.empty()) &&
15788 "Lowering trunc for vectors requiring different TBL instructions is "
15789 "not supported!");
15790 // Call TBL for the residual table bytes present in 1, 2, or 3 FP/SIMD
15791 // registers
15792 if (!Parts.empty()) {
15793 Intrinsic::ID TblID;
15794 switch (Parts.size()) {
15795 case 1:
15796 TblID = Intrinsic::aarch64_neon_tbl1;
15797 break;
15798 case 2:
15799 TblID = Intrinsic::aarch64_neon_tbl2;
15800 break;
15801 case 3:
15802 TblID = Intrinsic::aarch64_neon_tbl3;
15803 break;
15804 }
15805
15806 auto *F = Intrinsic::getDeclaration(TI->getModule(), TblID, VecTy);
15807 Parts.push_back(ConstantVector::get(MaskConst));
15808 Results.push_back(Builder.CreateCall(F, Parts));
15809 }
15810
15811 // Extract the destination vector from TBL result(s) after combining them
15812 // where applicable. Currently, at most two TBLs are supported.
15813 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
15814 "more than 2 tbl instructions!");
15815 Value *FinalResult = Results[0];
15816 if (Results.size() == 1) {
15817 if (ElemsPerTbl < 16) {
15818 SmallVector<int> FinalMask(ElemsPerTbl);
15819 std::iota(FinalMask.begin(), FinalMask.end(), 0);
15820 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
15821 }
15822 } else {
15823 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
15824 if (ElemsPerTbl < 16) {
15825 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
15826 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
15827 } else {
15828 std::iota(FinalMask.begin(), FinalMask.end(), 0);
15829 }
15830 FinalResult =
15831 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
15832 }
15833
15834 TI->replaceAllUsesWith(FinalResult);
15835 TI->eraseFromParent();
15836}
15837
15839 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
15840 // shuffle_vector instructions are serialized when targeting SVE,
15841 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
15842 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
15843 return false;
15844
15845 // Try to optimize conversions using tbl. This requires materializing constant
15846 // index vectors, which can increase code size and add loads. Skip the
15847 // transform unless the conversion is in a loop block guaranteed to execute
15848 // and we are not optimizing for size.
15849 Function *F = I->getParent()->getParent();
15850 if (!L || L->getHeader() != I->getParent() || F->hasMinSize() ||
15851 F->hasOptSize())
15852 return false;
15853
15854 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
15855 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
15856 if (!SrcTy || !DstTy)
15857 return false;
15858
15859 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
15860 // lowered to tbl instructions to insert the original i8 elements
15861 // into i8x lanes. This is enabled for cases where it is beneficial.
15862 auto *ZExt = dyn_cast<ZExtInst>(I);
15863 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
15864 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
15865 if (DstWidth % 8 != 0)
15866 return false;
15867
15868 auto *TruncDstType =
15869 cast<FixedVectorType>(VectorType::getTruncatedElementVectorType(DstTy));
15870 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
15871 // the remaining ZExt folded into the user, don't use tbl lowering.
15872 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
15873 if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
15876 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
15877 return false;
15878
15879 DstTy = TruncDstType;
15880 }
15881
15882 return createTblShuffleForZExt(ZExt, DstTy, Subtarget->isLittleEndian());
15883 }
15884
15885 auto *UIToFP = dyn_cast<UIToFPInst>(I);
15886 if (UIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
15887 DstTy->getElementType()->isFloatTy()) {
15888 IRBuilder<> Builder(I);
15889 auto *ZExt = cast<ZExtInst>(
15890 Builder.CreateZExt(I->getOperand(0), VectorType::getInteger(DstTy)));
15891 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
15892 I->replaceAllUsesWith(UI);
15893 I->eraseFromParent();
15894 return createTblShuffleForZExt(ZExt, cast<FixedVectorType>(ZExt->getType()),
15895 Subtarget->isLittleEndian());
15896 }
15897
15898 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
15899 // followed by a truncate lowered to using tbl.4.
15900 auto *FPToUI = dyn_cast<FPToUIInst>(I);
15901 if (FPToUI &&
15902 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
15903 SrcTy->getElementType()->isFloatTy() &&
15904 DstTy->getElementType()->isIntegerTy(8)) {
15905 IRBuilder<> Builder(I);
15906 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
15907 VectorType::getInteger(SrcTy));
15908 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
15909 I->replaceAllUsesWith(TruncI);
15910 I->eraseFromParent();
15911 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
15912 return true;
15913 }
15914
15915 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
15916 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
15917 // per lane of the input that is represented using 1, 2, 3, or 4 128-bit table
15918 // registers
15919 auto *TI = dyn_cast<TruncInst>(I);
15920 if (TI && DstTy->getElementType()->isIntegerTy(8) &&
15921 ((SrcTy->getElementType()->isIntegerTy(32) ||
15922 SrcTy->getElementType()->isIntegerTy(64)) &&
15923 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
15924 createTblForTrunc(TI, Subtarget->isLittleEndian());
15925 return true;
15926 }
15927
15928 return false;
15929}
15930
15932 Align &RequiredAligment) const {
15933 if (!LoadedType.isSimple() ||
15934 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
15935 return false;
15936 // Cyclone supports unaligned accesses.
15937 RequiredAligment = Align(1);
15938 unsigned NumBits = LoadedType.getSizeInBits();
15939 return NumBits == 32 || NumBits == 64;
15940}
15941
15942/// A helper function for determining the number of interleaved accesses we
15943/// will generate when lowering accesses of the given type.
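/// For example, a <16 x i32> access with 128-bit vectors is lowered as
/// (16 * 32 + 127) / 128 = 4 interleaved accesses.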
15945 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
15946 unsigned VecSize = 128;
15947 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
15948 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
15949 if (UseScalable)
15950 VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
15951 return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
15952}
15953
15956 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
15957 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
15958 return MOStridedAccess;
15960}
15961
15963 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
15964 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
15965 auto EC = VecTy->getElementCount();
15966 unsigned MinElts = EC.getKnownMinValue();
15967
15968 UseScalable = false;
15969
15970 if (!VecTy->isScalableTy() && !Subtarget->hasNEON())
15971 return false;
15972
15973 if (VecTy->isScalableTy() && !Subtarget->hasSVEorSME())
15974 return false;
15975
15976 // Ensure that the predicate for this number of elements is available.
15977 if (Subtarget->hasSVE() && !getSVEPredPatternFromNumElements(MinElts))
15978 return false;
15979
15980 // Ensure the number of vector elements is greater than 1.
15981 if (MinElts < 2)
15982 return false;
15983
15984 // Ensure the element type is legal.
15985 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
15986 return false;
15987
15988 if (EC.isScalable()) {
15989 UseScalable = true;
15990 return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
15991 }
15992
15993 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
15994 if (!Subtarget->isNeonAvailable() ||
15995 (Subtarget->useSVEForFixedLengthVectors() &&
15996 (VecSize % Subtarget->getMinSVEVectorSizeInBits() == 0 ||
15997 (VecSize < Subtarget->getMinSVEVectorSizeInBits() &&
15998 isPowerOf2_32(MinElts) && VecSize > 128)))) {
15999 UseScalable = true;
16000 return true;
16001 }
16002
16003 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
16004 // 128 will be split into multiple interleaved accesses.
16005 return VecSize == 64 || VecSize % 128 == 0;
16006}
16007
16009 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
16010 return ScalableVectorType::get(VTy->getElementType(), 2);
16011
16012 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
16013 return ScalableVectorType::get(VTy->getElementType(), 4);
16014
16015 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
16016 return ScalableVectorType::get(VTy->getElementType(), 8);
16017
16018 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
16019 return ScalableVectorType::get(VTy->getElementType(), 8);
16020
16021 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
16022 return ScalableVectorType::get(VTy->getElementType(), 2);
16023
16024 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
16025 return ScalableVectorType::get(VTy->getElementType(), 4);
16026
16027 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
16028 return ScalableVectorType::get(VTy->getElementType(), 8);
16029
16030 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
16031 return ScalableVectorType::get(VTy->getElementType(), 16);
16032
16033 llvm_unreachable("Cannot handle input vector type");
16034}
16035
16036static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
16037 bool Scalable, Type *LDVTy,
16038 Type *PtrTy) {
16039 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
16040 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
16041 Intrinsic::aarch64_sve_ld3_sret,
16042 Intrinsic::aarch64_sve_ld4_sret};
16043 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
16044 Intrinsic::aarch64_neon_ld3,
16045 Intrinsic::aarch64_neon_ld4};
16046 if (Scalable)
16047 return Intrinsic::getDeclaration(M, SVELoads[Factor - 2], {LDVTy});
16048
16049 return Intrinsic::getDeclaration(M, NEONLoads[Factor - 2], {LDVTy, PtrTy});
16050}
16051
16052static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
16053 bool Scalable, Type *STVTy,
16054 Type *PtrTy) {
16055 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
16056 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
16057 Intrinsic::aarch64_sve_st3,
16058 Intrinsic::aarch64_sve_st4};
16059 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
16060 Intrinsic::aarch64_neon_st3,
16061 Intrinsic::aarch64_neon_st4};
16062 if (Scalable)
16063 return Intrinsic::getDeclaration(M, SVEStores[Factor - 2], {STVTy});
16064
16065 return Intrinsic::getDeclaration(M, NEONStores[Factor - 2], {STVTy, PtrTy});
16066}
16067
16068/// Lower an interleaved load into a ldN intrinsic.
16069///
16070/// E.g. Lower an interleaved load (Factor = 2):
16071/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
16072/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
16073/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
16074///
16075/// Into:
16076/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
16077/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
16078/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
16081 ArrayRef<unsigned> Indices, unsigned Factor) const {
16082 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16083 "Invalid interleave factor");
16084 assert(!Shuffles.empty() && "Empty shufflevector input");
16085 assert(Shuffles.size() == Indices.size() &&
16086 "Unmatched number of shufflevectors and indices");
16087
16088 const DataLayout &DL = LI->getModule()->getDataLayout();
16089
16090 VectorType *VTy = Shuffles[0]->getType();
16091
16092 // Skip if we do not have NEON and skip illegal vector types. We can
16093 // "legalize" wide vector types into multiple interleaved accesses as long as
16094 // the vector types are divisible by 128.
16095 bool UseScalable;
16096 if (!Subtarget->hasNEON() ||
16097 !isLegalInterleavedAccessType(VTy, DL, UseScalable))
16098 return false;
16099
16100 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
16101
16102 auto *FVTy = cast<FixedVectorType>(VTy);
16103
16104 // A pointer vector cannot be the return type of the ldN intrinsics. Need to
16105 // load integer vectors first and then convert to pointer vectors.
16106 Type *EltTy = FVTy->getElementType();
16107 if (EltTy->isPointerTy())
16108 FVTy =
16109 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
16110
16111 // If we're going to generate more than one load, reset the sub-vector type
16112 // to something legal.
16113 FVTy = FixedVectorType::get(FVTy->getElementType(),
16114 FVTy->getNumElements() / NumLoads);
16115
16116 auto *LDVTy =
16117 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
16118
16119 IRBuilder<> Builder(LI);
16120
16121 // The base address of the load.
16122 Value *BaseAddr = LI->getPointerOperand();
16123
16124 Type *PtrTy = LI->getPointerOperandType();
16125 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
16126 LDVTy->getElementCount());
16127
16128 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
16129 UseScalable, LDVTy, PtrTy);
16130
16131 // Holds sub-vectors extracted from the load intrinsic return values. The
16132 // sub-vectors are associated with the shufflevector instructions they will
16133 // replace.
16135
16136 Value *PTrue = nullptr;
16137 if (UseScalable) {
16138 std::optional<unsigned> PgPattern =
16139 getSVEPredPatternFromNumElements(FVTy->getNumElements());
16140 if (Subtarget->getMinSVEVectorSizeInBits() ==
16141 Subtarget->getMaxSVEVectorSizeInBits() &&
16142 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
16143 PgPattern = AArch64SVEPredPattern::all;
16144
16145 auto *PTruePat =
16146 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
16147 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
16148 {PTruePat});
16149 }
16150
16151 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
16152
16153 // If we're generating more than one load, compute the base address of
16154 // subsequent loads as an offset from the previous.
16155 if (LoadCount > 0)
16156 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
16157 FVTy->getNumElements() * Factor);
16158
16159 CallInst *LdN;
16160 if (UseScalable)
16161 LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
16162 else
16163 LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
16164
16165 // Extract and store the sub-vectors returned by the load intrinsic.
16166 for (unsigned i = 0; i < Shuffles.size(); i++) {
16167 ShuffleVectorInst *SVI = Shuffles[i];
16168 unsigned Index = Indices[i];
16169
16170 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
16171
16172 if (UseScalable)
16173 SubVec = Builder.CreateExtractVector(
16174 FVTy, SubVec,
16175 ConstantInt::get(Type::getInt64Ty(VTy->getContext()), 0));
16176
16177 // Convert the integer vector to pointer vector if the element is pointer.
16178 if (EltTy->isPointerTy())
16179 SubVec = Builder.CreateIntToPtr(
16181 FVTy->getNumElements()));
16182
16183 SubVecs[SVI].push_back(SubVec);
16184 }
16185 }
16186
16187 // Replace uses of the shufflevector instructions with the sub-vectors
16188 // returned by the load intrinsic. If a shufflevector instruction is
16189 // associated with more than one sub-vector, those sub-vectors will be
16190 // concatenated into a single wide vector.
16191 for (ShuffleVectorInst *SVI : Shuffles) {
16192 auto &SubVec = SubVecs[SVI];
16193 auto *WideVec =
16194 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
16195 SVI->replaceAllUsesWith(WideVec);
16196 }
16197
16198 return true;
16199}
16200
16201template <typename Iter>
16202bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
16203 int MaxLookupDist = 20;
16204 unsigned IdxWidth = DL.getIndexSizeInBits(0);
16205 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
16206 const Value *PtrA1 =
16207 Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
16208
16209 while (++It != End) {
16210 if (It->isDebugOrPseudoInst())
16211 continue;
16212 if (MaxLookupDist-- == 0)
16213 break;
16214 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
16215 const Value *PtrB1 =
16216 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
16217 DL, OffsetB);
16218 if (PtrA1 == PtrB1 &&
16219 (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
16220 .abs() == 16)
16221 return true;
16222 }
16223 }
16224
16225 return false;
16226}
16227
16228/// Lower an interleaved store into a stN intrinsic.
16229///
16230/// E.g. Lower an interleaved store (Factor = 3):
16231/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
16232/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
16233/// store <12 x i32> %i.vec, <12 x i32>* %ptr
16234///
16235/// Into:
16236/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
16237/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
16238/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
16239/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
16240///
16241/// Note that the new shufflevectors will be removed and we'll only generate one
16242/// st3 instruction in CodeGen.
16243///
16244/// Example for a more general valid mask (Factor 3). Lower:
16245/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
16246/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
16247/// store <12 x i32> %i.vec, <12 x i32>* %ptr
16248///
16249/// Into:
16250/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
16251/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
16252/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
16253/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
16255 ShuffleVectorInst *SVI,
16256 unsigned Factor) const {
16257
16258 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16259 "Invalid interleave factor");
16260
16261 auto *VecTy = cast<FixedVectorType>(SVI->getType());
16262 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
16263
16264 unsigned LaneLen = VecTy->getNumElements() / Factor;
16265 Type *EltTy = VecTy->getElementType();
16266 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
16267
16268 const DataLayout &DL = SI->getModule()->getDataLayout();
16269 bool UseScalable;
16270
16271 // Skip if we do not have NEON and skip illegal vector types. We can
16272 // "legalize" wide vector types into multiple interleaved accesses as long as
16273 // the vector types are divisible by 128.
16274 if (!Subtarget->hasNEON() ||
16275 !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
16276 return false;
16277
16278 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
16279
16280 Value *Op0 = SVI->getOperand(0);
16281 Value *Op1 = SVI->getOperand(1);
16282 IRBuilder<> Builder(SI);
16283
16284 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
16285 // vectors to integer vectors.
16286 if (EltTy->isPointerTy()) {
16287 Type *IntTy = DL.getIntPtrType(EltTy);
16288 unsigned NumOpElts =
16289 cast<FixedVectorType>(Op0->getType())->getNumElements();
16290
16291 // Convert to the corresponding integer vector.
16292 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
16293 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
16294 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
16295
16296 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
16297 }
16298
16299 // If we're going to generate more than one store, reset the lane length
16300 // and sub-vector type to something legal.
16301 LaneLen /= NumStores;
16302 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
16303
16304 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
16305 : SubVecTy;
16306
16307 // The base address of the store.
16308 Value *BaseAddr = SI->getPointerOperand();
16309
16310 auto Mask = SVI->getShuffleMask();
16311
16312 // Sanity check: bail out if all the indices are out of range.
16313 // If the mask is `poison`, `Mask` may be a vector of -1s.
16314 // If all of them are `poison`, an out-of-bounds read would happen later.
16315 if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
16316 return false;
16317 }
16318 // A 64-bit st2 which does not start at element 0 will involve adding extra
16319 // ext elements, making the st2 unprofitable. Also, if there is a nearby store
16320 // that points to BaseAddr+16 or BaseAddr-16, then it can be better left as a
16321 // zip;ldp pair, which has higher throughput.
16322 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
16323 (Mask[0] != 0 ||
16324 hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
16325 DL) ||
16326 hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
16327 BaseAddr, DL)))
16328 return false;
16329
16330 Type *PtrTy = SI->getPointerOperandType();
16331 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
16332 STVTy->getElementCount());
16333
16334 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
16335 UseScalable, STVTy, PtrTy);
16336
16337 Value *PTrue = nullptr;
16338 if (UseScalable) {
16339 std::optional<unsigned> PgPattern =
16340 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
16341 if (Subtarget->getMinSVEVectorSizeInBits() ==
16342 Subtarget->getMaxSVEVectorSizeInBits() &&
16343 Subtarget->getMinSVEVectorSizeInBits() ==
16344 DL.getTypeSizeInBits(SubVecTy))
16345 PgPattern = AArch64SVEPredPattern::all;
16346
16347 auto *PTruePat =
16348 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
16349 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
16350 {PTruePat});
16351 }
16352
16353 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
16354
16356
16357 // Split the shufflevector operands into sub vectors for the new stN call.
16358 for (unsigned i = 0; i < Factor; i++) {
16359 Value *Shuffle;
16360 unsigned IdxI = StoreCount * LaneLen * Factor + i;
16361 if (Mask[IdxI] >= 0) {
16362 Shuffle = Builder.CreateShuffleVector(
16363 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
16364 } else {
16365 unsigned StartMask = 0;
16366 for (unsigned j = 1; j < LaneLen; j++) {
16367 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
16368 if (Mask[IdxJ] >= 0) {
16369 StartMask = Mask[IdxJ] - j;
16370 break;
16371 }
16372 }
16373 // Note: Filling undef gaps with random elements is OK, since
16374 // those elements were being written anyway (with undefs).
16375 // In the case of all undefs, we default to using elements from 0.
16376 // Note: StartMask cannot be negative; it's checked in
16377 // isReInterleaveMask.
16378 Shuffle = Builder.CreateShuffleVector(
16379 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
16380 }
16381
16382 if (UseScalable)
16383 Shuffle = Builder.CreateInsertVector(
16384 STVTy, UndefValue::get(STVTy), Shuffle,
16385 ConstantInt::get(Type::getInt64Ty(STVTy->getContext()), 0));
16386
16387 Ops.push_back(Shuffle);
16388 }
16389
16390 if (UseScalable)
16391 Ops.push_back(PTrue);
16392
16393 // If we're generating more than one store, compute the base address of
16394 // subsequent stores as an offset from the previous.
16395 if (StoreCount > 0)
16396 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
16397 BaseAddr, LaneLen * Factor);
16398
16399 Ops.push_back(BaseAddr);
16400 Builder.CreateCall(StNFunc, Ops);
16401 }
16402 return true;
16403}
16404
16406 IntrinsicInst *DI, LoadInst *LI) const {
16407 // Only deinterleave2 supported at present.
16408 if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
16409 return false;
16410
16411 // Only a factor of 2 supported at present.
16412 const unsigned Factor = 2;
16413
16414 VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
16415 const DataLayout &DL = DI->getModule()->getDataLayout();
16416 bool UseScalable;
16417 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
16418 return false;
16419
16420 // TODO: Add support for using SVE instructions with fixed types later, using
16421 // the code from lowerInterleavedLoad to obtain the correct container type.
16422 if (UseScalable && !VTy->isScalableTy())
16423 return false;
16424
16425 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
16426
16427 VectorType *LdTy =
16429 VTy->getElementCount().divideCoefficientBy(NumLoads));
16430
16431 Type *PtrTy = LI->getPointerOperandType();
16432 Function *LdNFunc = getStructuredLoadFunction(DI->getModule(), Factor,
16433 UseScalable, LdTy, PtrTy);
16434
16435 IRBuilder<> Builder(LI);
16436
16437 Value *Pred = nullptr;
16438 if (UseScalable)
16439 Pred =
16440 Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
16441
16442 Value *BaseAddr = LI->getPointerOperand();
16443 Value *Result;
16444 if (NumLoads > 1) {
16445 Value *Left = PoisonValue::get(VTy);
16447
16448 for (unsigned I = 0; I < NumLoads; ++I) {
16449 Value *Offset = Builder.getInt64(I * Factor);
16450
16451 Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
16452 Value *LdN = nullptr;
16453 if (UseScalable)
16454 LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
16455 else
16456 LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
16457
16458 Value *Idx =
16459 Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
16460 Left = Builder.CreateInsertVector(
16461 VTy, Left, Builder.CreateExtractValue(LdN, 0), Idx);
16462 Right = Builder.CreateInsertVector(
16463 VTy, Right, Builder.CreateExtractValue(LdN, 1), Idx);
16464 }
16465
16466 Result = PoisonValue::get(DI->getType());
16467 Result = Builder.CreateInsertValue(Result, Left, 0);
16468 Result = Builder.CreateInsertValue(Result, Right, 1);
16469 } else {
16470 if (UseScalable)
16471 Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
16472 else
16473 Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
16474 }
16475
16476 DI->replaceAllUsesWith(Result);
16477 return true;
16478}
16479
16481 IntrinsicInst *II, StoreInst *SI) const {
16482 // Only interleave2 supported at present.
16483 if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
16484 return false;
16485
16486 // Only a factor of 2 supported at present.
16487 const unsigned Factor = 2;
16488
16489 VectorType *VTy = cast<VectorType>(II->getOperand(0)->getType());
16490 const DataLayout &DL = II->getModule()->getDataLayout();
16491 bool UseScalable;
16492 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
16493 return false;
16494
16495 // TODO: Add support for using SVE instructions with fixed types later, using
16496 // the code from lowerInterleavedStore to obtain the correct container type.
16497 if (UseScalable && !VTy->isScalableTy())
16498 return false;
16499
16500 unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
16501
16502 VectorType *StTy =
16504 VTy->getElementCount().divideCoefficientBy(NumStores));
16505
16506 Type *PtrTy = SI->getPointerOperandType();
16507 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
16508 UseScalable, StTy, PtrTy);
16509
16510 IRBuilder<> Builder(SI);
16511
16512 Value *BaseAddr = SI->getPointerOperand();
16513 Value *Pred = nullptr;
16514
16515 if (UseScalable)
16516 Pred =
16517 Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
16518
16519 Value *L = II->getOperand(0);
16520 Value *R = II->getOperand(1);
16521
16522 for (unsigned I = 0; I < NumStores; ++I) {
16523 Value *Address = BaseAddr;
16524 if (NumStores > 1) {
16525 Value *Offset = Builder.getInt64(I * Factor);
16526 Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
16527
16528 Value *Idx =
16529 Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
16530 L = Builder.CreateExtractVector(StTy, II->getOperand(0), Idx);
16531 R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
16532 }
16533
16534 if (UseScalable)
16535 Builder.CreateCall(StNFunc, {L, R, Pred, Address});
16536 else
16537 Builder.CreateCall(StNFunc, {L, R, Address});
16538 }
16539
16540 return true;
16541}
16542
16544 const MemOp &Op, const AttributeList &FuncAttributes) const {
16545 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
16546 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
16547 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
16548 // Only use AdvSIMD to implement memset of 32 bytes and above. It would have
16549 // taken one instruction to materialize the v2i64 zero and one store (with
16550 // restrictive addressing mode). Just do i64 stores.
16551 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
16552 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
16553 if (Op.isAligned(AlignCheck))
16554 return true;
16555 unsigned Fast;
16556 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
16558 Fast;
16559 };
16560
16561 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
16562 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
16563 return MVT::v16i8;
16564 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
16565 return MVT::f128;
16566 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
16567 return MVT::i64;
16568 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
16569 return MVT::i32;
16570 return MVT::Other;
16571}
16572
16574 const MemOp &Op, const AttributeList &FuncAttributes) const {
16575 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
16576 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
16577 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
16578 // Only use AdvSIMD to implement memset of 32 bytes and above. It would have
16579 // taken one instruction to materialize the v2i64 zero and one store (with
16580 // restrictive addressing mode). Just do i64 stores.
16581 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
16582 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
16583 if (Op.isAligned(AlignCheck))
16584 return true;
16585 unsigned Fast;
16586 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
16588 Fast;
16589 };
16590
16591 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
16592 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
16593 return LLT::fixed_vector(2, 64);
16594 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
16595 return LLT::scalar(128);
16596 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
16597 return LLT::scalar(64);
16598 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
16599 return LLT::scalar(32);
16600 return LLT();
16601}
16602
16603// 12-bit optionally shifted immediates are legal for adds.
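// e.g. add x0, x1, #4095 (imm12) and add x0, x1, #4095, lsl #12 are both
// legal, while add x0, x1, #4097 is not and must materialize the immediate.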
16605 if (Immed == std::numeric_limits<int64_t>::min()) {
16606 LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
16607 << ": avoid UB for INT64_MIN\n");
16608 return false;
16609 }
16610 // Same encoding for add/sub, just flip the sign.
16611 Immed = std::abs(Immed);
16612 bool IsLegal = ((Immed >> 12) == 0 ||
16613 ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
16614 LLVM_DEBUG(dbgs() << "Is " << Immed
16615 << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
16616 return IsLegal;
16617}
16618
16619// Return false to prevent folding
16620// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
16621// if the folding leads to worse code.
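// For instance, with c1 = 1 (a legal add immediate) and c2 = 0x123456789,
// the product c1*c2 would take several mov/movk instructions to materialize,
// so the fold is rejected below.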
16623 SDValue AddNode, SDValue ConstNode) const {
16624 // Let the DAGCombiner decide for vector types and large types.
16625 const EVT VT = AddNode.getValueType();
16626 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
16627 return true;
16628
16629 // It is worse if c1 is a legal add immediate while c1*c2 is not and has
16630 // to be materialized with at least two instructions.
16631 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
16632 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
16633 const int64_t C1 = C1Node->getSExtValue();
16634 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
16635 if (!isLegalAddImmediate(C1) || isLegalAddImmediate(C1C2.getSExtValue()))
16636 return true;
16637 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
16638 // Adapt to the width of a register.
16639 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
16640 AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn);
16641 if (Insn.size() > 1)
16642 return false;
16643
16644 // Default to true and let the DAGCombiner decide.
16645 return true;
16646}
16647
16648// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
16649// immediates is the same as for an add or a sub.
16650bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
16651 return isLegalAddImmediate(Immed);
16652}
16653
16654/// isLegalAddressingMode - Return true if the addressing mode represented
16655/// by AM is legal for this target, for a load/store of the specified type.
16656bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
16657 const AddrMode &AMode, Type *Ty,
16658 unsigned AS, Instruction *I) const {
16659 // AArch64 has five basic addressing modes:
16660 // reg
16661 // reg + 9-bit signed offset
16662 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
16663 // reg1 + reg2
16664 // reg + SIZE_IN_BYTES * reg
16665
16666 // No global is ever allowed as a base.
16667 if (AMode.BaseGV)
16668 return false;
16669
16670 // No reg+reg+imm addressing.
16671 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
16672 return false;
16673
16674 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
16675 // `2*ScaledReg` into `BaseReg + ScaledReg`
16676 AddrMode AM = AMode;
16677 if (AM.Scale && !AM.HasBaseReg) {
16678 if (AM.Scale == 1) {
16679 AM.HasBaseReg = true;
16680 AM.Scale = 0;
16681 } else if (AM.Scale == 2) {
16682 AM.HasBaseReg = true;
16683 AM.Scale = 1;
16684 } else {
16685 return false;
16686 }
16687 }
16688
16689 // A base register is required in all addressing modes.
16690 if (!AM.HasBaseReg)
16691 return false;
16692
16693 if (Ty->isScalableTy()) {
16694 if (isa<ScalableVectorType>(Ty)) {
16695 uint64_t VecElemNumBytes =
16696 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
16697 return AM.HasBaseReg && !AM.BaseOffs &&
16698 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
16699 }
16700
16701 return AM.HasBaseReg && !AM.BaseOffs && !AM.Scale;
16702 }
16703
16704 // check reg + imm case:
16705 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
16706 uint64_t NumBytes = 0;
16707 if (Ty->isSized()) {
16708 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
16709 NumBytes = NumBits / 8;
16710 if (!isPowerOf2_64(NumBits))
16711 NumBytes = 0;
16712 }
16713
16714 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
16715 AM.Scale);
16716}
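// Rough illustration of the rules above (assumed, not exhaustive): for an i64
// access, [x0], [x0, #-256], [x0, #32760] and [x0, x1, lsl #3] are all
// representable, while [x0, x1, lsl #2] (scale != element size) and any
// reg+reg+imm form are rejected.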
16717
16718// Check whether the two offsets belong to the same imm24 range and share the
16719// same high 12 bits; if so, the high part can be materialized by a single ADD.
16720int64_t
16721AArch64TargetLowering::getPreferredLargeGEPBaseOffset(int64_t MinOffset,
16722 int64_t MaxOffset) const {
16723 int64_t HighPart = MinOffset & ~0xfffULL;
16724 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
16725 // Rebase the value to an integer multiple of imm12.
16726 return HighPart;
16727 }
16728
16729 return 0;
16730}
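// Worked example (values assumed): with MinOffset = 0x1234 and MaxOffset =
// 0x1ff8, both offsets share the high part 0x1000, which is itself a legal
// add immediate, so 0x1000 is returned; the base can then be rebased with one
// ADD and the accesses use the residual offsets 0x234 .. 0xff8.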
16731
16732bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
16733 // Consider splitting large offsets of structs or arrays.
16734 return true;
16735}
16736
16737bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
16738 const MachineFunction &MF, EVT VT) const {
16739 VT = VT.getScalarType();
16740
16741 if (!VT.isSimple())
16742 return false;
16743
16744 switch (VT.getSimpleVT().SimpleTy) {
16745 case MVT::f16:
16746 return Subtarget->hasFullFP16();
16747 case MVT::f32:
16748 case MVT::f64:
16749 return true;
16750 default:
16751 break;
16752 }
16753
16754 return false;
16755}
16756
16757bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
16758 Type *Ty) const {
16759 switch (Ty->getScalarType()->getTypeID()) {
16760 case Type::FloatTyID:
16761 case Type::DoubleTyID:
16762 return true;
16763 default:
16764 return false;
16765 }
16766}
16767
16768bool AArch64TargetLowering::generateFMAsInMachineCombiner(
16769 EVT VT, CodeGenOptLevel OptLevel) const {
16770 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
16771 !useSVEForFixedLengthVectorVT(VT);
16772}
16773
16774const MCPhysReg *
16775AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
16776 // LR is a callee-save register, but we must treat it as clobbered by any call
16777 // site. Hence we include LR in the scratch registers, which are in turn added
16778 // as implicit-defs for stackmaps and patchpoints.
16779 static const MCPhysReg ScratchRegs[] = {
16780 AArch64::X16, AArch64::X17, AArch64::LR, 0
16781 };
16782 return ScratchRegs;
16783}
16784
16785ArrayRef<MCPhysReg> AArch64TargetLowering::getRoundingControlRegisters() const {
16786 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
16787 return RCRegs;
16788}
16789
16790bool
16791AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
16792 CombineLevel Level) const {
16793 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
16794 N->getOpcode() == ISD::SRL) &&
16795 "Expected shift op");
16796
16797 SDValue ShiftLHS = N->getOperand(0);
16798 EVT VT = N->getValueType(0);
16799
16800 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
16801 // combine it with shift 'N' to let it be lowered to UBFX except:
16802 // ((x >> C) & mask) << C.
16803 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
16804 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
16805 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
16806 if (isMask_64(TruncMask)) {
16807 SDValue AndLHS = ShiftLHS.getOperand(0);
16808 if (AndLHS.getOpcode() == ISD::SRL) {
16809 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
16810 if (N->getOpcode() == ISD::SHL)
16811 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
16812 return SRLC->getZExtValue() == SHLC->getZExtValue();
16813 return false;
16814 }
16815 }
16816 }
16817 }
16818 return true;
16819}
16820
16821bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
16822 const SDNode *N) const {
16823 assert(N->getOpcode() == ISD::XOR &&
16824 (N->getOperand(0).getOpcode() == ISD::SHL ||
16825 N->getOperand(0).getOpcode() == ISD::SRL) &&
16826 "Expected XOR(SHIFT) pattern");
16827
16828 // Only commute if the entire NOT mask is a hidden shifted mask.
16829 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
16830 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
16831 if (XorC && ShiftC) {
16832 unsigned MaskIdx, MaskLen;
16833 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
16834 unsigned ShiftAmt = ShiftC->getZExtValue();
16835 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
16836 if (N->getOperand(0).getOpcode() == ISD::SHL)
16837 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
16838 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
16839 }
16840 }
16841
16842 return false;
16843}
16844
16845bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
16846 const SDNode *N, CombineLevel Level) const {
16847 assert(((N->getOpcode() == ISD::SHL &&
16848 N->getOperand(0).getOpcode() == ISD::SRL) ||
16849 (N->getOpcode() == ISD::SRL &&
16850 N->getOperand(0).getOpcode() == ISD::SHL)) &&
16851 "Expected shift-shift mask");
16852 // Don't allow multiuse shift folding with the same shift amount.
16853 if (!N->getOperand(0)->hasOneUse())
16854 return false;
16855
16856 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
16857 EVT VT = N->getValueType(0);
16858 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
16859 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
16860 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
16861 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
16862 }
16863
16864 return true;
16865}
16866
16867bool AArch64TargetLowering::shouldFoldSelectWithIdentityConstant(
16868 unsigned BinOpcode, EVT VT) const {
16869 return VT.isScalableVector() && isTypeLegal(VT);
16870}
16871
16872bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
16873 Type *Ty) const {
16874 assert(Ty->isIntegerTy());
16875
16876 unsigned BitSize = Ty->getPrimitiveSizeInBits();
16877 if (BitSize == 0)
16878 return false;
16879
16880 int64_t Val = Imm.getSExtValue();
16881 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
16882 return true;
16883
16884 if ((int64_t)Val < 0)
16885 Val = ~Val;
16886 if (BitSize == 32)
16887 Val &= (1LL << 32) - 1;
16888
16889 unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
16890 // MOVZ is free so return true for one or fewer MOVK.
16891 return Shift < 3;
16892}
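// Illustrative example (values assumed): 0xfff3 (a single MOVZ) and
// 0x12340000ULL (a MOVZ with a 16-bit shift) are considered cheap to
// materialize, whereas 0x0123456789abcdefULL would need a MOVZ plus three
// MOVKs and is better left as a load.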
16893
16894bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
16895 unsigned Index) const {
16896 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
16897 return false;
16898
16899 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
16900}
16901
16902/// Turn vector tests of the signbit in the form of:
16903/// xor (sra X, elt_size(X)-1), -1
16904/// into:
16905/// cmge X, X, #0
16906static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
16907 const AArch64Subtarget *Subtarget) {
16908 EVT VT = N->getValueType(0);
16909 if (!Subtarget->hasNEON() || !VT.isVector())
16910 return SDValue();
16911
16912 // There must be a shift right algebraic before the xor, and the xor must be a
16913 // 'not' operation.
16914 SDValue Shift = N->getOperand(0);
16915 SDValue Ones = N->getOperand(1);
16916 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
16917 !ISD::isBuildVectorAllOnes(Ones.getNode()))
16918 return SDValue();
16919
16920 // The shift should be smearing the sign bit across each vector element.
16921 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
16922 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
16923 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
16924 return SDValue();
16925
16926 return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
16927}
16928
16929// Given a vecreduce_add node, detect the below pattern and convert it to the
16930// node sequence with UABDL, [S|U]ABD and UADDLP.
16931//
16932// i32 vecreduce_add(
16933// v16i32 abs(
16934// v16i32 sub(
16935// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
16936// =================>
16937// i32 vecreduce_add(
16938// v4i32 UADDLP(
16939// v8i16 add(
16940// v8i16 zext(
16941// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
16942// v8i16 zext(
16943// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
16944static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
16945 SelectionDAG &DAG) {
16946 // Assumed i32 vecreduce_add
16947 if (N->getValueType(0) != MVT::i32)
16948 return SDValue();
16949
16950 SDValue VecReduceOp0 = N->getOperand(0);
16951 unsigned Opcode = VecReduceOp0.getOpcode();
16952 // Assumed v16i32 abs
16953 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
16954 return SDValue();
16955
16956 SDValue ABS = VecReduceOp0;
16957 // Assumed v16i32 sub
16958 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
16959 ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
16960 return SDValue();
16961
16962 SDValue SUB = ABS->getOperand(0);
16963 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
16964 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
16965 // Assumed v16i32 type
16966 if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
16967 SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
16968 return SDValue();
16969
16970 // Assumed zext or sext
16971 bool IsZExt = false;
16972 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
16973 IsZExt = true;
16974 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
16975 IsZExt = false;
16976 } else
16977 return SDValue();
16978
16979 SDValue EXT0 = SUB->getOperand(0);
16980 SDValue EXT1 = SUB->getOperand(1);
16981 // Assumed zext's operand has v16i8 type
16982 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
16983 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
16984 return SDValue();
16985
16986 // Pattern is detected. Let's convert it to a sequence of nodes.
16987 SDLoc DL(N);
16988
16989 // First, create the node pattern of UABD/SABD.
16990 SDValue UABDHigh8Op0 =
16991 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
16992 DAG.getConstant(8, DL, MVT::i64));
16993 SDValue UABDHigh8Op1 =
16994 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
16995 DAG.getConstant(8, DL, MVT::i64));
16996 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
16997 UABDHigh8Op0, UABDHigh8Op1);
16998 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
16999
17000 // Second, create the node pattern of UABAL.
17001 SDValue UABDLo8Op0 =
17002 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
17003 DAG.getConstant(0, DL, MVT::i64));
17004 SDValue UABDLo8Op1 =
17005 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
17006 DAG.getConstant(0, DL, MVT::i64));
17007 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
17008 UABDLo8Op0, UABDLo8Op1);
17009 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
17010 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
17011
17012 // Third, create the node of UADDLP.
17013 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
17014
17015 // Fourth, create the node of VECREDUCE_ADD.
17016 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
17017}
17018
17019// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce:
17020// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
17021// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
17022// If we have vectors larger than v16i8 we extract v16i8 vectors,
17023// follow the same steps above to get DOT instructions, concatenate them,
17024// and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
17025static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
17026 const AArch64Subtarget *ST) {
17027 if (!ST->hasDotProd())
17028 return performVecReduceAddCombineWithUADDLP(N, DAG);
17029
17030 SDValue Op0 = N->getOperand(0);
17031 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
17032 Op0.getValueType().getVectorElementType() != MVT::i32)
17033 return SDValue();
17034
17035 unsigned ExtOpcode = Op0.getOpcode();
17036 SDValue A = Op0;
17037 SDValue B;
17038 if (ExtOpcode == ISD::MUL) {
17039 A = Op0.getOperand(0);
17040 B = Op0.getOperand(1);
17041 if (A.getOpcode() != B.getOpcode() ||
17042 A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
17043 return SDValue();
17044 ExtOpcode = A.getOpcode();
17045 }
17046 if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
17047 return SDValue();
17048
17049 EVT Op0VT = A.getOperand(0).getValueType();
17050 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
17051 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
17052 if (!IsValidElementCount || !IsValidSize)
17053 return SDValue();
17054
17055 SDLoc DL(Op0);
17056 // For non-mla reductions B can be set to 1. For MLA we take the operand of
17057 // the extend B.
17058 if (!B)
17059 B = DAG.getConstant(1, DL, Op0VT);
17060 else
17061 B = B.getOperand(0);
17062
17063 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
17064 unsigned NumOfVecReduce;
17065 EVT TargetType;
17066 if (IsMultipleOf16) {
17067 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
17068 TargetType = MVT::v4i32;
17069 } else {
17070 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
17071 TargetType = MVT::v2i32;
17072 }
17073 auto DotOpcode =
17074 (ExtOpcode == ISD::ZERO_EXTEND) ? AArch64ISD::UDOT : AArch64ISD::SDOT;
17075 // Handle the case where we need to generate only one Dot operation.
17076 if (NumOfVecReduce == 1) {
17077 SDValue Zeros = DAG.getConstant(0, DL, TargetType);
17078 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
17079 A.getOperand(0), B);
17080 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
17081 }
17082 // Generate Dot instructions that are multiple of 16.
17083 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
17084 SmallVector<SDValue, 4> SDotVec16;
17085 unsigned I = 0;
17086 for (; I < VecReduce16Num; I += 1) {
17087 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
17088 SDValue Op0 =
17089 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
17090 DAG.getConstant(I * 16, DL, MVT::i64));
17091 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
17092 DAG.getConstant(I * 16, DL, MVT::i64));
17093 SDValue Dot =
17094 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
17095 SDotVec16.push_back(Dot);
17096 }
17097 // Concatenate dot operations.
17098 EVT SDot16EVT =
17099 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
17100 SDValue ConcatSDot16 =
17101 DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
17102 SDValue VecReduceAdd16 =
17103 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
17104 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
17105 if (VecReduce8Num == 0)
17106 return VecReduceAdd16;
17107
17108 // Generate the remainder Dot operation that is multiple of 8.
17109 SmallVector<SDValue, 4> SDotVec8;
17110 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
17111 SDValue Vec8Op0 =
17112 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
17113 DAG.getConstant(I * 16, DL, MVT::i64));
17114 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
17115 DAG.getConstant(I * 16, DL, MVT::i64));
17116 SDValue Dot =
17117 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
17118 SDValue VecReduceAdd8 =
17119 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
17120 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
17121 VecReduceAdd8);
17122}
17123
17124// Given an (integer) vecreduce, we know the order of the inputs does not
17125// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
17126// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
17127// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
17128static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {
17129 auto DetectAddExtract = [&](SDValue A) {
17130 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
17131 // UADDLP(x) if found.
17132 assert(A.getOpcode() == ISD::ADD);
17133 EVT VT = A.getValueType();
17134 SDValue Op0 = A.getOperand(0);
17135 SDValue Op1 = A.getOperand(1);
17136 if (Op0.getOpcode() != Op1.getOpcode() ||
17137 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
17138 Op0.getOpcode() != ISD::SIGN_EXTEND))
17139 return SDValue();
17140 SDValue Ext0 = Op0.getOperand(0);
17141 SDValue Ext1 = Op1.getOperand(0);
17142 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
17143 Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
17144 Ext0.getOperand(0) != Ext1.getOperand(0))
17145 return SDValue();
17146 // Check that the type is twice the add types, and the extracts are from
17147 // upper/lower parts of the same source.
17148 if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
17149 VT.getVectorNumElements() * 2)
17150 return SDValue();
17151 if ((Ext0.getConstantOperandVal(1) != 0 ||
17152 Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
17153 (Ext1.getConstantOperandVal(1) != 0 ||
17154 Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
17155 return SDValue();
17156 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
17157 : AArch64ISD::SADDLP;
17158 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
17159 };
17160
17161 if (SDValue R = DetectAddExtract(A))
17162 return R;
17163
17164 if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
17165 if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
17166 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
17167 A.getOperand(1));
17168 if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
17169 if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
17170 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
17171 A.getOperand(0));
17172 return SDValue();
17173}
17174
17175// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
17176// UADDLV(concat), where the concat represents the 64-bit zext sources.
17177static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG) {
17178 // Look for add(zext(64-bit source), zext(64-bit source)), returning
17179 // UADDLV(concat(zext, zext)) if found.
17180 assert(A.getOpcode() == ISD::ADD);
17181 EVT VT = A.getValueType();
17182 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
17183 return SDValue();
17184 SDValue Op0 = A.getOperand(0);
17185 SDValue Op1 = A.getOperand(1);
17186 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
17187 return SDValue();
17188 SDValue Ext0 = Op0.getOperand(0);
17189 SDValue Ext1 = Op1.getOperand(0);
17190 EVT ExtVT0 = Ext0.getValueType();
17191 EVT ExtVT1 = Ext1.getValueType();
17192 // Check zext VTs are the same and 64-bit length.
17193 if (ExtVT0 != ExtVT1 ||
17194 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
17195 return SDValue();
17196 // Get VT for concat of zext sources.
17197 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
17198 SDValue Concat =
17199 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
17200
17201 switch (VT.getSimpleVT().SimpleTy) {
17202 case MVT::v2i64:
17203 case MVT::v4i32:
17204 return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
17205 case MVT::v8i16: {
17206 SDValue Uaddlv =
17207 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
17208 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
17209 }
17210 default:
17211 llvm_unreachable("Unhandled vector type");
17212 }
17213}
17214
17215static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
17216 SDValue A = N->getOperand(0);
17217 if (A.getOpcode() == ISD::ADD) {
17218 if (SDValue R = performUADDVAddCombine(A, DAG))
17219 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
17220 else if (SDValue R = performUADDVZextCombine(A, DAG))
17221 return R;
17222 }
17223 return SDValue();
17224}
17225
17226static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
17227 TargetLowering::DAGCombinerInfo &DCI,
17228 const AArch64Subtarget *Subtarget) {
17229 if (DCI.isBeforeLegalizeOps())
17230 return SDValue();
17231
17232 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
17233}
17234
17235SDValue
17236AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
17237 SelectionDAG &DAG,
17238 SmallVectorImpl<SDNode *> &Created) const {
17239 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
17240 if (isIntDivCheap(N->getValueType(0), Attr))
17241 return SDValue(N,0); // Lower SDIV as SDIV
17242
17243 EVT VT = N->getValueType(0);
17244
17245 // For scalable and fixed types, mark them as cheap so we can handle it much
17246 // later. This allows us to handle larger than legal types.
17247 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
17248 return SDValue(N, 0);
17249
17250 // fold (sdiv X, pow2)
17251 if ((VT != MVT::i32 && VT != MVT::i64) ||
17252 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
17253 return SDValue();
17254
17255 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
17256}
17257
17258SDValue
17259AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
17260 SelectionDAG &DAG,
17261 SmallVectorImpl<SDNode *> &Created) const {
17262 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
17263 if (isIntDivCheap(N->getValueType(0), Attr))
17264 return SDValue(N, 0); // Lower SREM as SREM
17265
17266 EVT VT = N->getValueType(0);
17267
17268 // For scalable and fixed types, mark them as cheap so we can handle it much
17269 // later. This allows us to handle larger than legal types.
17270 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
17271 return SDValue(N, 0);
17272
17273 // fold (srem X, pow2)
17274 if ((VT != MVT::i32 && VT != MVT::i64) ||
17275 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
17276 return SDValue();
17277
17278 unsigned Lg2 = Divisor.countr_zero();
17279 if (Lg2 == 0)
17280 return SDValue();
17281
17282 SDLoc DL(N);
17283 SDValue N0 = N->getOperand(0);
17284 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
17285 SDValue Zero = DAG.getConstant(0, DL, VT);
17286 SDValue CCVal, CSNeg;
17287 if (Lg2 == 1) {
17288 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
17289 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
17290 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
17291
17292 Created.push_back(Cmp.getNode());
17293 Created.push_back(And.getNode());
17294 } else {
17295 SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
17296 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
17297
17298 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
17299 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
17300 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
17301 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
17302 Negs.getValue(1));
17303
17304 Created.push_back(Negs.getNode());
17305 Created.push_back(AndPos.getNode());
17306 Created.push_back(AndNeg.getNode());
17307 }
17308
17309 return CSNeg;
17310}
17311
17312static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
17313 switch(getIntrinsicID(S.getNode())) {
17314 default:
17315 break;
17316 case Intrinsic::aarch64_sve_cntb:
17317 return 8;
17318 case Intrinsic::aarch64_sve_cnth:
17319 return 16;
17320 case Intrinsic::aarch64_sve_cntw:
17321 return 32;
17322 case Intrinsic::aarch64_sve_cntd:
17323 return 64;
17324 }
17325 return {};
17326}
17327
17328/// Calculates what the pre-extend type is, based on the extension
17329/// operation node provided by \p Extend.
17330///
17331/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
17332/// pre-extend type is pulled directly from the operand, while other extend
17333/// operations need a bit more inspection to get this information.
17334///
17335/// \param Extend The SDNode from the DAG that represents the extend operation
17336///
17337/// \returns The type representing the \p Extend source type, or \p MVT::Other
17338/// if no valid type can be determined
17339static EVT calculatePreExtendType(SDValue Extend) {
17340 switch (Extend.getOpcode()) {
17341 case ISD::SIGN_EXTEND:
17342 case ISD::ZERO_EXTEND:
17343 return Extend.getOperand(0).getValueType();
17344 case ISD::AssertSext:
17345 case ISD::AssertZext:
17346 case ISD::SIGN_EXTEND_INREG: {
17347 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
17348 if (!TypeNode)
17349 return MVT::Other;
17350 return TypeNode->getVT();
17351 }
17352 case ISD::AND: {
17353 ConstantSDNode *Constant =
17354 dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
17355 if (!Constant)
17356 return MVT::Other;
17357
17358 uint32_t Mask = Constant->getZExtValue();
17359
17360 if (Mask == UCHAR_MAX)
17361 return MVT::i8;
17362 else if (Mask == USHRT_MAX)
17363 return MVT::i16;
17364 else if (Mask == UINT_MAX)
17365 return MVT::i32;
17366
17367 return MVT::Other;
17368 }
17369 default:
17370 return MVT::Other;
17371 }
17372}
17373
17374/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
17375/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
17376/// SExt/ZExt rather than the scalar SExt/ZExt
17377static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
17378 EVT VT = BV.getValueType();
17379 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
17380 BV.getOpcode() != ISD::VECTOR_SHUFFLE)
17381 return SDValue();
17382
17383 // Use the first item in the buildvector/shuffle to get the size of the
17384 // extend, and make sure it looks valid.
17385 SDValue Extend = BV->getOperand(0);
17386 unsigned ExtendOpcode = Extend.getOpcode();
17387 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
17388 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
17389 ExtendOpcode == ISD::AssertSext;
17390 if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
17391 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
17392 return SDValue();
17393 // Shuffle inputs are vector, limit to SIGN_EXTEND and ZERO_EXTEND to ensure
17394 // calculatePreExtendType will work without issue.
17395 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
17396 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
17397 return SDValue();
17398
17399 // Restrict valid pre-extend data type
17400 EVT PreExtendType = calculatePreExtendType(Extend);
17401 if (PreExtendType == MVT::Other ||
17402 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
17403 return SDValue();
17404
17405 // Make sure all other operands are equally extended
17406 for (SDValue Op : drop_begin(BV->ops())) {
17407 if (Op.isUndef())
17408 continue;
17409 unsigned Opc = Op.getOpcode();
17410 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
17411 Opc == ISD::AssertSext;
17412 if (OpcIsSExt != IsSExt || calculatePreExtendType(Op) != PreExtendType)
17413 return SDValue();
17414 }
17415
17416 SDValue NBV;
17417 SDLoc DL(BV);
17418 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
17419 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
17420 EVT PreExtendLegalType =
17421 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
17422 SmallVector<SDValue, 8> NewOps;
17423 for (SDValue Op : BV->ops())
17424 NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
17425 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
17426 PreExtendLegalType));
17427 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
17428 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
17429 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
17430 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
17431 BV.getOperand(1).isUndef()
17432 ? DAG.getUNDEF(PreExtendVT)
17433 : BV.getOperand(1).getOperand(0),
17434 cast<ShuffleVectorSDNode>(BV)->getMask());
17435 }
17436 return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, NBV);
17437}
17438
17439/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
17440/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
17441static SDValue performMulVectorExtendCombine(SDValue Mul, SelectionDAG &DAG) {
17442 // If the value type isn't a vector, none of the operands are going to be dups
17443 EVT VT = Mul->getValueType(0);
17444 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
17445 return SDValue();
17446
17447 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
17448 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
17449
17450 // Neither operands have been changed, don't make any further changes
17451 if (!Op0 && !Op1)
17452 return SDValue();
17453
17454 SDLoc DL(Mul);
17455 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
17456 Op1 ? Op1 : Mul->getOperand(1));
17457}
17458
17459// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
17460// Same for other types with equivalent constants.
17462 EVT VT = N->getValueType(0);
17463 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
17464 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
17465 return SDValue();
17466 if (N->getOperand(0).getOpcode() != ISD::AND ||
17467 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
17468 return SDValue();
17469
17470 SDValue And = N->getOperand(0);
17471 SDValue Srl = And.getOperand(0);
17472
17473 APInt V1, V2, V3;
17474 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
17475 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
17476 !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3))
17477 return SDValue();
17478
17479 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
17480 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
17481 V3 != (HalfSize - 1))
17482 return SDValue();
17483
17484 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
17485 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
17486 VT.getVectorElementCount() * 2);
17487
17488 SDLoc DL(N);
17489 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
17490 SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In);
17491 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
17492}
17493
17494static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
17495 TargetLowering::DAGCombinerInfo &DCI,
17496 const AArch64Subtarget *Subtarget) {
17497
17498 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
17499 return Ext;
17501 return Ext;
17502
17503 if (DCI.isBeforeLegalizeOps())
17504 return SDValue();
17505
17506 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
17507 // and in MachineCombiner pass, add+mul will be combined into madd.
17508 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
17509 SDLoc DL(N);
17510 EVT VT = N->getValueType(0);
17511 SDValue N0 = N->getOperand(0);
17512 SDValue N1 = N->getOperand(1);
17513 SDValue MulOper;
17514 unsigned AddSubOpc;
17515
17516 auto IsAddSubWith1 = [&](SDValue V) -> bool {
17517 AddSubOpc = V->getOpcode();
17518 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
17519 SDValue Opnd = V->getOperand(1);
17520 MulOper = V->getOperand(0);
17521 if (AddSubOpc == ISD::SUB)
17522 std::swap(Opnd, MulOper);
17523 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
17524 return C->isOne();
17525 }
17526 return false;
17527 };
17528
17529 if (IsAddSubWith1(N0)) {
17530 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
17531 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
17532 }
17533
17534 if (IsAddSubWith1(N1)) {
17535 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
17536 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
17537 }
17538
17539 // The below optimizations require a constant RHS.
17540 if (!isa<ConstantSDNode>(N1))
17541 return SDValue();
17542
17543 ConstantSDNode *C = cast<ConstantSDNode>(N1);
17544 const APInt &ConstValue = C->getAPIntValue();
17545
17546 // Allow the scaling to be folded into the `cnt` instruction by preventing
17547 // the scaling from being obscured here. This makes it easier to pattern match.
17548 if (IsSVECntIntrinsic(N0) ||
17549 (N0->getOpcode() == ISD::TRUNCATE &&
17550 (IsSVECntIntrinsic(N0->getOperand(0)))))
17551 if (ConstValue.sge(1) && ConstValue.sle(16))
17552 return SDValue();
17553
17554 // Multiplication of a power of two plus/minus one can be done more
17555 // cheaply as shift+add/sub. For now, this is true unilaterally. If
17556 // future CPUs have a cheaper MADD instruction, this may need to be
17557 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
17558 // 64-bit is 5 cycles, so this is always a win.
17559 // More aggressively, some multiplications N0 * C can be lowered to
17560 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
17561 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
17562 // TODO: lower more cases.
17563
17564 // TrailingZeroes is used to test if the mul can be lowered to
17565 // shift+add+shift.
17566 unsigned TrailingZeroes = ConstValue.countr_zero();
17567 if (TrailingZeroes) {
17568 // Conservatively do not lower to shift+add+shift if the mul might be
17569 // folded into smul or umul.
17570 if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
17571 isZeroExtended(N0, DAG)))
17572 return SDValue();
17573 // Conservatively do not lower to shift+add+shift if the mul might be
17574 // folded into madd or msub.
17575 if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
17576 N->use_begin()->getOpcode() == ISD::SUB))
17577 return SDValue();
17578 }
17579 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
17580 // and shift+add+shift.
17581 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
17582 unsigned ShiftAmt;
17583
17584 auto Shl = [&](SDValue N0, unsigned N1) {
17585 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
17586 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
17587 };
17588 auto Add = [&](SDValue N0, SDValue N1) {
17589 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
17590 };
17591 auto Sub = [&](SDValue N0, SDValue N1) {
17592 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
17593 };
17594 auto Negate = [&](SDValue N) {
17595 SDValue Zero = DAG.getConstant(0, DL, VT);
17596 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
17597 };
17598
17599 // Can the const C be decomposed into (1+2^M1)*(1+2^N1), eg:
17600 // C = 45 is equal to (1+4)*(1+8); we don't decompose it into (1+2)*(16-1) as
17601 // the (2^N - 1) can't be executed via a single instruction.
17602 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
17603 unsigned BitWidth = C.getBitWidth();
17604 for (unsigned i = 1; i < BitWidth / 2; i++) {
17605 APInt Rem;
17606 APInt X(BitWidth, (1 << i) + 1);
17607 APInt::sdivrem(C, X, N, Rem);
17608 APInt NVMinus1 = N - 1;
17609 if (Rem == 0 && NVMinus1.isPowerOf2()) {
17610 M = X;
17611 return true;
17612 }
17613 }
17614 return false;
17615 };
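 // Worked example (illustrative) of the (1+2^M)*(1+2^N) decomposition handled
 // below: C = 45 = 5 * 9 = (1 + 2^2) * (1 + 2^3), so with ShiftM1 = 2 and
 // ShiftN1 = 3 the multiply becomes MVal = (x << 2) + x = 5*x followed by
 // (MVal << 3) + MVal = 45*x, i.e. two shifted ADDs instead of materializing
 // 45 and using MUL/MADD.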
17616
17617 if (ConstValue.isNonNegative()) {
17618 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
17619 // (mul x, 2^N - 1) => (sub (shl x, N), x)
17620 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
17621 // (mul x, (2^M + 1) * (2^N + 1))
17622 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
17623 APInt SCVMinus1 = ShiftedConstValue - 1;
17624 APInt SCVPlus1 = ShiftedConstValue + 1;
17625 APInt CVPlus1 = ConstValue + 1;
17626 APInt CVM, CVN;
17627 if (SCVMinus1.isPowerOf2()) {
17628 ShiftAmt = SCVMinus1.logBase2();
17629 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
17630 } else if (CVPlus1.isPowerOf2()) {
17631 ShiftAmt = CVPlus1.logBase2();
17632 return Sub(Shl(N0, ShiftAmt), N0);
17633 } else if (SCVPlus1.isPowerOf2()) {
17634 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
17635 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
17636 } else if (Subtarget->hasALULSLFast() &&
17637 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
17638 APInt CVMMinus1 = CVM - 1;
17639 APInt CVNMinus1 = CVN - 1;
17640 unsigned ShiftM1 = CVMMinus1.logBase2();
17641 unsigned ShiftN1 = CVNMinus1.logBase2();
17642 // LSLFast implies that shifts of <= 3 places are fast
17643 if (ShiftM1 <= 3 && ShiftN1 <= 3) {
17644 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
17645 return Add(Shl(MVal, ShiftN1), MVal);
17646 }
17647 }
17648 } else {
17649 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
17650 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
17651 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
17652 APInt SCVPlus1 = -ShiftedConstValue + 1;
17653 APInt CVNegPlus1 = -ConstValue + 1;
17654 APInt CVNegMinus1 = -ConstValue - 1;
17655 if (CVNegPlus1.isPowerOf2()) {
17656 ShiftAmt = CVNegPlus1.logBase2();
17657 return Sub(N0, Shl(N0, ShiftAmt));
17658 } else if (CVNegMinus1.isPowerOf2()) {
17659 ShiftAmt = CVNegMinus1.logBase2();
17660 return Negate(Add(Shl(N0, ShiftAmt), N0));
17661 } else if (SCVPlus1.isPowerOf2()) {
17662 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
17663 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
17664 }
17665 }
17666
17667 return SDValue();
17668}
17669
17670static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
17671 SelectionDAG &DAG) {
17672 // Take advantage of vector comparisons producing 0 or -1 in each lane to
17673 // optimize away operation when it's from a constant.
17674 //
17675 // The general transformation is:
17676 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
17677 // AND(VECTOR_CMP(x,y), constant2)
17678 // constant2 = UNARYOP(constant)
17679
17680 // Early exit if this isn't a vector operation, the operand of the
17681 // unary operation isn't a bitwise AND, or if the sizes of the operations
17682 // aren't the same.
17683 EVT VT = N->getValueType(0);
17684 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
17685 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
17686 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
17687 return SDValue();
17688
17689 // Now check that the other operand of the AND is a constant. We could
17690 // make the transformation for non-constant splats as well, but it's unclear
17691 // that would be a benefit as it would not eliminate any operations, just
17692 // perform one more step in scalar code before moving to the vector unit.
17693 if (BuildVectorSDNode *BV =
17694 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
17695 // Bail out if the vector isn't a constant.
17696 if (!BV->isConstant())
17697 return SDValue();
17698
17699 // Everything checks out. Build up the new and improved node.
17700 SDLoc DL(N);
17701 EVT IntVT = BV->getValueType(0);
17702 // Create a new constant of the appropriate type for the transformed
17703 // DAG.
17704 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
17705 // The AND node needs bitcasts to/from an integer vector type around it.
17706 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
17707 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
17708 N->getOperand(0)->getOperand(0), MaskConst);
17709 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
17710 return Res;
17711 }
17712
17713 return SDValue();
17714}
17715
17716static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
17717 const AArch64Subtarget *Subtarget) {
17718 // First try to optimize away the conversion when it's conditionally from
17719 // a constant. Vectors only.
17720 if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
17721 return Res;
17722
17723 EVT VT = N->getValueType(0);
17724 if (VT != MVT::f32 && VT != MVT::f64)
17725 return SDValue();
17726
17727 // Only optimize when the source and destination types have the same width.
17728 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
17729 return SDValue();
17730
17731 // If the result of an integer load is only used by an integer-to-float
17732 // conversion, use an FP load and an AdvSIMD scalar {S|U}CVTF instead.
17733 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
17734 SDValue N0 = N->getOperand(0);
17735 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
17736 N0.hasOneUse() &&
17737 // Do not change the width of a volatile load.
17738 !cast<LoadSDNode>(N0)->isVolatile()) {
17739 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
17740 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
17741 LN0->getPointerInfo(), LN0->getAlign(),
17742 LN0->getMemOperand()->getFlags());
17743
17744 // Make sure successors of the original load stay after it by updating them
17745 // to use the new Chain.
17746 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
17747
17748 unsigned Opcode =
17750 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
17751 }
17752
17753 return SDValue();
17754}
17755
17756/// Fold a floating-point multiply by power of two into floating-point to
17757/// fixed-point conversion.
17758static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
17759 TargetLowering::DAGCombinerInfo &DCI,
17760 const AArch64Subtarget *Subtarget) {
17761 if (!Subtarget->isNeonAvailable())
17762 return SDValue();
17763
17764 if (!N->getValueType(0).isSimple())
17765 return SDValue();
17766
17767 SDValue Op = N->getOperand(0);
17768 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
17769 return SDValue();
17770
17771 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
17772 return SDValue();
17773
17774 SDValue ConstVec = Op->getOperand(1);
17775 if (!isa<BuildVectorSDNode>(ConstVec))
17776 return SDValue();
17777
17778 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
17779 uint32_t FloatBits = FloatTy.getSizeInBits();
17780 if (FloatBits != 32 && FloatBits != 64 &&
17781 (FloatBits != 16 || !Subtarget->hasFullFP16()))
17782 return SDValue();
17783
17784 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
17785 uint32_t IntBits = IntTy.getSizeInBits();
17786 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
17787 return SDValue();
17788
17789 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
17790 if (IntBits > FloatBits)
17791 return SDValue();
17792
17793 BitVector UndefElements;
17794 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
17795 int32_t Bits = IntBits == 64 ? 64 : 32;
17796 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
17797 if (C == -1 || C == 0 || C > Bits)
17798 return SDValue();
17799
17800 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
17801 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
17802 return SDValue();
17803
17804 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
17805 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
17806 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
17807 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
17808 return SDValue();
17809 }
17810
17811 SDLoc DL(N);
17812 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
17813 N->getOpcode() == ISD::FP_TO_SINT_SAT);
17814 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
17815 : Intrinsic::aarch64_neon_vcvtfp2fxu;
17816 SDValue FixConv =
17817 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
17818 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
17819 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
17820 // We can handle smaller integers by generating an extra trunc.
17821 if (IntBits < FloatBits)
17822 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
17823
17824 return FixConv;
17825}
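// Illustrative example (operand names assumed): fptosi(fmul v4f32 %x,
// splat(16.0)) is folded into the fixed-point convert intrinsic
// aarch64_neon_vcvtfp2fxs(%x, 4), i.e. a single "fcvtzs v0.4s, v0.4s, #4",
// since 16.0 = 2^4 becomes the fbits operand.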
17826
17827/// Fold a floating-point divide by power of two into fixed-point to
17828/// floating-point conversion.
17829static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
17830 TargetLowering::DAGCombinerInfo &DCI,
17831 const AArch64Subtarget *Subtarget) {
17832 if (!Subtarget->hasNEON())
17833 return SDValue();
17834
17835 SDValue Op = N->getOperand(0);
17836 unsigned Opc = Op->getOpcode();
17837 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
17838 !Op.getOperand(0).getValueType().isSimple() ||
17839 (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
17840 return SDValue();
17841
17842 SDValue ConstVec = N->getOperand(1);
17843 if (!isa<BuildVectorSDNode>(ConstVec))
17844 return SDValue();
17845
17846 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
17847 int32_t IntBits = IntTy.getSizeInBits();
17848 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
17849 return SDValue();
17850
17851 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
17852 int32_t FloatBits = FloatTy.getSizeInBits();
17853 if (FloatBits != 32 && FloatBits != 64)
17854 return SDValue();
17855
17856 // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
17857 if (IntBits > FloatBits)
17858 return SDValue();
17859
17860 BitVector UndefElements;
17861 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
17862 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
17863 if (C == -1 || C == 0 || C > FloatBits)
17864 return SDValue();
17865
17866 MVT ResTy;
17867 unsigned NumLanes = Op.getValueType().getVectorNumElements();
17868 switch (NumLanes) {
17869 default:
17870 return SDValue();
17871 case 2:
17872 ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
17873 break;
17874 case 4:
17875 ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
17876 break;
17877 }
17878
17879 if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
17880 return SDValue();
17881
17882 SDLoc DL(N);
17883 SDValue ConvInput = Op.getOperand(0);
17884 bool IsSigned = Opc == ISD::SINT_TO_FP;
17885 if (IntBits < FloatBits)
17886 ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
17887 ResTy, ConvInput);
17888
17889 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
17890 : Intrinsic::aarch64_neon_vcvtfxu2fp;
17891 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
17892 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
17893 DAG.getConstant(C, DL, MVT::i32));
17894}
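// Illustrative example (operand names assumed): fdiv(sitofp(v2i32 %x),
// splat(4.0)) is folded into aarch64_neon_vcvtfxs2fp(%x, 2), i.e. a single
// "scvtf v0.2s, v0.2s, #2", since dividing by 2^2 becomes the fbits operand.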
17895
17896static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
17897 const AArch64TargetLowering &TLI) {
17898 EVT VT = N->getValueType(0);
17899 SelectionDAG &DAG = DCI.DAG;
17900 SDLoc DL(N);
17901
17902 if (!VT.isVector())
17903 return SDValue();
17904
17905 // The combining code currently only works for NEON vectors. In particular,
17906 // it does not work for SVE when dealing with vectors wider than 128 bits.
17907 // It also doesn't work in streaming mode because it would generate
17908 // bsl instructions that are invalid in streaming mode.
17909 if (TLI.useSVEForFixedLengthVectorVT(
17910 VT, !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable()))
17911 return SDValue();
17912
17913 SDValue N0 = N->getOperand(0);
17914 if (N0.getOpcode() != ISD::AND)
17915 return SDValue();
17916
17917 SDValue N1 = N->getOperand(1);
17918 if (N1.getOpcode() != ISD::AND)
17919 return SDValue();
17920
17921 // InstCombine does (not (neg a)) => (add a -1).
17922 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
17923 // Loop over all combinations of AND operands.
17924 for (int i = 1; i >= 0; --i) {
17925 for (int j = 1; j >= 0; --j) {
17926 SDValue O0 = N0->getOperand(i);
17927 SDValue O1 = N1->getOperand(j);
17928 SDValue Sub, Add, SubSibling, AddSibling;
17929
17930 // Find a SUB and an ADD operand, one from each AND.
17931 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
17932 Sub = O0;
17933 Add = O1;
17934 SubSibling = N0->getOperand(1 - i);
17935 AddSibling = N1->getOperand(1 - j);
17936 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
17937 Add = O0;
17938 Sub = O1;
17939 AddSibling = N0->getOperand(1 - i);
17940 SubSibling = N1->getOperand(1 - j);
17941 } else
17942 continue;
17943
17944 if (!ISD::isBuildVectorAllZeros(Sub.getOperand(0).getNode()))
17945 continue;
17946
17947 // The all-ones constant is always the right-hand operand of the Add.
17948 if (!ISD::isBuildVectorAllOnes(Add.getOperand(1).getNode()))
17949 continue;
17950
17951 if (Sub.getOperand(1) != Add.getOperand(0))
17952 continue;
17953
17954 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
17955 }
17956 }
17957
17958 // (or (and a b) (and (not a) c)) => (bsl a b c)
17959 // We only have to look for constant vectors here since the general, variable
17960 // case can be handled in TableGen.
17961 unsigned Bits = VT.getScalarSizeInBits();
17962 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
17963 for (int i = 1; i >= 0; --i)
17964 for (int j = 1; j >= 0; --j) {
17965 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
17966 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
17967 if (!BVN0 || !BVN1)
17968 continue;
17969
17970 bool FoundMatch = true;
17971 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
17972 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
17973 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
17974 if (!CN0 || !CN1 ||
17975 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
17976 FoundMatch = false;
17977 break;
17978 }
17979 }
17980
17981 if (FoundMatch)
17982 return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
17983 N0->getOperand(1 - i), N1->getOperand(1 - j));
17984 }
17985
17986 return SDValue();
17987}
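// Illustrative example (constants assumed): with v4i32 operands,
// (or (and %a, <0x000000ff,...>) (and %b, <0xffffff00,...>)) matches the
// constant case above because the two build_vector masks are bitwise
// complements, and it is rewritten to a single AArch64ISD::BSP node with the
// first mask as the selector (later matched to a NEON bit-select).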
17988
17989// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
17990// convert to csel(ccmp(.., cc0)), depending on cc1:
17991
17992// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
17993// =>
17994// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
17995//
17996// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
17997// =>
17998// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
17999static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
18000 EVT VT = N->getValueType(0);
18001 SDValue CSel0 = N->getOperand(0);
18002 SDValue CSel1 = N->getOperand(1);
18003
18004 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
18005 CSel1.getOpcode() != AArch64ISD::CSEL)
18006 return SDValue();
18007
18008 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
18009 return SDValue();
18010
18011 if (!isNullConstant(CSel0.getOperand(0)) ||
18012 !isOneConstant(CSel0.getOperand(1)) ||
18013 !isNullConstant(CSel1.getOperand(0)) ||
18014 !isOneConstant(CSel1.getOperand(1)))
18015 return SDValue();
18016
18017 SDValue Cmp0 = CSel0.getOperand(3);
18018 SDValue Cmp1 = CSel1.getOperand(3);
18019 AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(2);
18020 AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(2);
18021 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
18022 return SDValue();
18023 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
18024 Cmp0.getOpcode() == AArch64ISD::SUBS) {
18025 std::swap(Cmp0, Cmp1);
18026 std::swap(CC0, CC1);
18027 }
18028
18029 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
18030 return SDValue();
18031
18032 SDLoc DL(N);
18033 SDValue CCmp, Condition;
18034 unsigned NZCV;
18035
18036 if (N->getOpcode() == ISD::AND) {
18037 AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(CC0);
18038 Condition = DAG.getConstant(InvCC0, DL, MVT_CC);
18039 NZCV = AArch64CC::getNZCVToSatisfyCondCode(CC1);
18040 } else {
18041 AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(CC1);
18042 Condition = DAG.getConstant(CC0, DL, MVT_CC);
18043 NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvCC1);
18044 }
18045
18046 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
18047
18048 auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
18049 if (Op1 && Op1->getAPIntValue().isNegative() &&
18050 Op1->getAPIntValue().sgt(-32)) {
18051 // CCMP accepts constants in the range [0, 31];
18052 // if Op1 is a constant in the range [-31, -1], we
18053 // can select CCMN instead to avoid the extra mov.
18054 SDValue AbsOp1 =
18055 DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
18056 CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1,
18057 NZCVOp, Condition, Cmp0);
18058 } else {
18059 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
18060 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
18061 }
18062 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
18063 CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
18064 CCmp);
18065}
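// Illustrative example (register assignment hypothetical): for i32 values,
// (a < b) && (c == d), i.e. and(csel(0,1,lt,cmp(a,b)), csel(0,1,eq,cmp(c,d))),
// becomes
//   cmp  w0, w1
//   ccmp w2, w3, #0, lt
//   cset w0, eq
// where #0 is the NZCV value that makes "eq" fail whenever the first compare
// was not "lt".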
18066
18068 const AArch64Subtarget *Subtarget,
18069 const AArch64TargetLowering &TLI) {
18070 SelectionDAG &DAG = DCI.DAG;
18071 EVT VT = N->getValueType(0);
18072
18073 if (SDValue R = performANDORCSELCombine(N, DAG))
18074 return R;
18075
18076 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
18077 return SDValue();
18078
18079 if (SDValue Res = tryCombineToBSL(N, DCI, TLI))
18080 return Res;
18081
18082 return SDValue();
18083}
18084
18085static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
18086 if (!MemVT.getVectorElementType().isSimple())
18087 return false;
18088
18089 uint64_t MaskForTy = 0ull;
18090 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
18091 case MVT::i8:
18092 MaskForTy = 0xffull;
18093 break;
18094 case MVT::i16:
18095 MaskForTy = 0xffffull;
18096 break;
18097 case MVT::i32:
18098 MaskForTy = 0xffffffffull;
18099 break;
18100 default:
18101 return false;
18102 break;
18103 }
18104
18105 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
18106 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
18107 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
18108
18109 return false;
18110}
18111
18112static SDValue performReinterpretCastCombine(SDNode *N) {
18113 SDValue LeafOp = SDValue(N, 0);
18114 SDValue Op = N->getOperand(0);
18115 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
18116 LeafOp.getValueType() != Op.getValueType())
18117 Op = Op->getOperand(0);
18118 if (LeafOp.getValueType() == Op.getValueType())
18119 return Op;
18120 return SDValue();
18121}
18122
18123static SDValue performSVEAndCombine(SDNode *N,
18124 TargetLowering::DAGCombinerInfo &DCI) {
18125 SelectionDAG &DAG = DCI.DAG;
18126 SDValue Src = N->getOperand(0);
18127 unsigned Opc = Src->getOpcode();
18128
18129 // Zero/any extend of an unsigned unpack
18130 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
18131 SDValue UnpkOp = Src->getOperand(0);
18132 SDValue Dup = N->getOperand(1);
18133
18134 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
18135 return SDValue();
18136
18137 SDLoc DL(N);
18138 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
18139 if (!C)
18140 return SDValue();
18141
18142 uint64_t ExtVal = C->getZExtValue();
18143
18144 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
18145 return ((ExtVal == 0xFF && VT == MVT::i8) ||
18146 (ExtVal == 0xFFFF && VT == MVT::i16) ||
18147 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
18148 };
18149
18150 // If the mask is fully covered by the unpack, we don't need to push
18151 // a new AND onto the operand
18152 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
18153 if (MaskAndTypeMatch(EltTy))
18154 return Src;
18155
18156 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
18157 // to see if the mask is all-ones of size MemTy.
18158 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(UnpkOp);
18159 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
18160 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
18161 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
18162 if (MaskAndTypeMatch(EltTy))
18163 return Src;
18164 }
18165
18166 // Truncate to prevent a DUP with an overly wide constant
18167 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
18168
18169 // Otherwise, make sure we propagate the AND to the operand
18170 // of the unpack
18171 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
18172 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
18173
18174 SDValue And = DAG.getNode(ISD::AND, DL,
18175 UnpkOp->getValueType(0), UnpkOp, Dup);
18176
18177 return DAG.getNode(Opc, DL, N->getValueType(0), And);
18178 }
18179
18180 if (DCI.isBeforeLegalizeOps())
18181 return SDValue();
18182
18183 // If one operand of the AND is an all-active predicate, the AND is a no-op
18184 // and we can simply return the other operand.
18185 if (isAllActivePredicate(DAG, N->getOperand(0)))
18186 return N->getOperand(1);
18187 if (isAllActivePredicate(DAG, N->getOperand(1)))
18188 return N->getOperand(0);
18189
18190 if (!EnableCombineMGatherIntrinsics)
18191 return SDValue();
18192
18193 SDValue Mask = N->getOperand(1);
18194
18195 if (!Src.hasOneUse())
18196 return SDValue();
18197
18198 EVT MemVT;
18199
18200 // SVE load instructions perform an implicit zero-extend, which makes them
18201 // perfect candidates for combining.
18202 switch (Opc) {
18206 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
18207 break;
18223 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
18224 break;
18225 default:
18226 return SDValue();
18227 }
18228
18229 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
18230 return Src;
18231
18232 return SDValue();
18233}
18234
18235// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
18236static SDValue performANDSETCCCombine(SDNode *N,
18237 TargetLowering::DAGCombinerInfo &DCI) {
18238
18239 // This function performs an optimization on a specific pattern involving
18240 // an AND operation and SETCC (Set Condition Code) node.
18241
18242 SDValue SetCC = N->getOperand(0);
18243 EVT VT = N->getValueType(0);
18244 SelectionDAG &DAG = DCI.DAG;
18245
18246 // If the current node (N) is used by any SELECT instruction, return an
18247 // empty SDValue and skip the optimization, since applying it in that case
18248 // could produce incorrect results.
18249 for (auto U : N->uses())
18250 if (U->getOpcode() == ISD::SELECT)
18251 return SDValue();
18252
18253 // Check if the operand is a SETCC node with floating-point comparison
18254 if (SetCC.getOpcode() == ISD::SETCC &&
18255 SetCC.getOperand(0).getValueType() == MVT::f32) {
18256
18257 SDValue Cmp;
18258 AArch64CC::CondCode CC;
18259
18260 // Check if the DAG is after legalization and if we can emit the conjunction
18261 if (!DCI.isBeforeLegalize() &&
18262 (Cmp = emitConjunction(DAG, SDValue(N, 0), CC))) {
18263
18264 AArch64CC::CondCode InvertedCC = getInvertedCondCode(CC);
18265
18266 SDLoc DL(N);
18267 return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
18268 DAG.getConstant(0, DL, VT),
18269 DAG.getConstant(InvertedCC, DL, MVT::i32), Cmp);
18270 }
18271 }
18272 return SDValue();
18273}
18274
18275 static SDValue performANDCombine(SDNode *N,
18276 TargetLowering::DAGCombinerInfo &DCI) {
18277 SelectionDAG &DAG = DCI.DAG;
18278 SDValue LHS = N->getOperand(0);
18279 SDValue RHS = N->getOperand(1);
18280 EVT VT = N->getValueType(0);
18281
18282 if (SDValue R = performANDORCSELCombine(N, DAG))
18283 return R;
18284
18285 if (SDValue R = performANDSETCCCombine(N, DCI))
18286 return R;
18287
18288 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
18289 return SDValue();
18290
18291 if (VT.isScalableVector())
18292 return performSVEAndCombine(N, DCI);
18293
18294 // The combining code below works only for NEON vectors. In particular, it
18295 // does not work for SVE when dealing with vectors wider than 128 bits.
18296 if (!VT.is64BitVector() && !VT.is128BitVector())
18297 return SDValue();
18298
18299 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
18300 if (!BVN)
18301 return SDValue();
18302
18303 // AND does not accept an immediate, so check if we can use a BIC immediate
18304 // instruction instead. We do this here instead of using a (and x, (mvni imm))
18305 // pattern in isel, because some immediates may be lowered to the preferred
18306 // (and x, (movi imm)) form, even though an mvni representation also exists.
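// For example, (and v4i32 X, (splat 0xFFFF00FF)) clears only bits 15:8 of each
// lane and can be selected as "bic v.4s, #0xff, lsl #8" instead of
// materialising the AND mask in a register.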
18307 APInt DefBits(VT.getSizeInBits(), 0);
18308 APInt UndefBits(VT.getSizeInBits(), 0);
18309 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
18310 SDValue NewOp;
18311
18312 // Any bits known to already be 0 need not be cleared again, which can help
18313 // reduce the size of the immediate to one supported by the instruction.
18314 KnownBits Known = DAG.computeKnownBits(LHS);
18315 APInt ZeroSplat(VT.getSizeInBits(), 0);
18316 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
18317 ZeroSplat |= Known.Zero.zext(VT.getSizeInBits())
18318 << (Known.Zero.getBitWidth() * I);
18319
18320 DefBits = ~(DefBits | ZeroSplat);
18321 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
18322 DefBits, &LHS)) ||
18323 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
18324 DefBits, &LHS)))
18325 return NewOp;
18326
18327 UndefBits = ~(UndefBits | ZeroSplat);
18328 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
18329 UndefBits, &LHS)) ||
18330 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
18331 UndefBits, &LHS)))
18332 return NewOp;
18333 }
18334
18335 return SDValue();
18336}
18337
18338 static SDValue performFADDCombine(SDNode *N,
18339 TargetLowering::DAGCombinerInfo &DCI) {
18340 SelectionDAG &DAG = DCI.DAG;
18341 SDValue LHS = N->getOperand(0);
18342 SDValue RHS = N->getOperand(1);
18343 EVT VT = N->getValueType(0);
18344 SDLoc DL(N);
18345
18346 if (!N->getFlags().hasAllowReassociation())
18347 return SDValue();
18348
18349 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
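// vcmla accumulates into its first (accumulator) operand, so when reassociation
// is allowed the extra fadd operand can be folded into that accumulator.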
18350 auto ReassocComplex = [&](SDValue A, SDValue B) {
18351 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
18352 return SDValue();
18353 unsigned Opc = A.getConstantOperandVal(0);
18354 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
18355 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
18356 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
18357 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
18358 return SDValue();
18359 SDValue VCMLA = DAG.getNode(
18360 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
18361 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
18362 A.getOperand(2), A.getOperand(3));
18363 VCMLA->setFlags(A->getFlags());
18364 return VCMLA;
18365 };
18366 if (SDValue R = ReassocComplex(LHS, RHS))
18367 return R;
18368 if (SDValue R = ReassocComplex(RHS, LHS))
18369 return R;
18370
18371 return SDValue();
18372}
18373
18374static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
18375 switch (Opcode) {
18376 case ISD::STRICT_FADD:
18377 case ISD::FADD:
18378 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
18379 case ISD::ADD:
18380 return VT == MVT::i64;
18381 default:
18382 return false;
18383 }
18384}
18385
18386static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
18387 AArch64CC::CondCode Cond);
18388
18389 static bool isPredicateCCSettingOp(SDValue N) {
18390 if ((N.getOpcode() == ISD::SETCC) ||
18391 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
18392 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
18393 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
18394 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
18395 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
18396 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
18397 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
18398 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
18399 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
18400 // get_active_lane_mask is lowered to a whilelo instruction.
18401 N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
18402 return true;
18403
18404 return false;
18405}
18406
18407// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
18408// ... into: "ptrue p, all" + PTEST
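// With an all-true governing predicate, PTEST's FIRST_ACTIVE condition reflects
// element 0 of the tested predicate, so the extract becomes a read of NZCV
// instead of moving the predicate into a general-purpose register.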
18409static SDValue
18410 performFirstTrueTestVectorCombine(SDNode *N,
18411 TargetLowering::DAGCombinerInfo &DCI,
18412 const AArch64Subtarget *Subtarget) {
18413 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18414 // Make sure PTEST can be legalised with illegal types.
18415 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
18416 return SDValue();
18417
18418 SDValue N0 = N->getOperand(0);
18419 EVT VT = N0.getValueType();
18420
18421 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
18422 !isNullConstant(N->getOperand(1)))
18423 return SDValue();
18424
18425 // Restrict the DAG combine to only cases where we're extracting from a
18426 // flag-setting operation.
18427 if (!isPredicateCCSettingOp(N0))
18428 return SDValue();
18429
18430 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
18431 SelectionDAG &DAG = DCI.DAG;
18432 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
18433 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
18434}
18435
18436// Materialize : Idx = (add (mul vscale, NumEls), -1)
18437// i1 = extract_vector_elt t37, Constant:i64<Idx>
18438// ... into: "ptrue p, all" + PTEST
18439static SDValue
18440 performLastTrueTestVectorCombine(SDNode *N,
18441 TargetLowering::DAGCombinerInfo &DCI,
18442 const AArch64Subtarget *Subtarget) {
18443 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18444 // Make sure PTEST can be legalised with illegal types.
18445 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
18446 return SDValue();
18447
18448 SDValue N0 = N->getOperand(0);
18449 EVT OpVT = N0.getValueType();
18450
18451 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
18452 return SDValue();
18453
18454 // Idx == (add (mul vscale, NumEls), -1)
18455 SDValue Idx = N->getOperand(1);
18456 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
18457 return SDValue();
18458
18459 SDValue VS = Idx.getOperand(0);
18460 if (VS.getOpcode() != ISD::VSCALE)
18461 return SDValue();
18462
18463 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
18464 if (VS.getConstantOperandVal(0) != NumEls)
18465 return SDValue();
18466
18467 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
18468 SelectionDAG &DAG = DCI.DAG;
18469 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
18470 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
18471}
18472
18473static SDValue
18474 performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18475 const AArch64Subtarget *Subtarget) {
18476 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18477 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
18478 return Res;
18479 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
18480 return Res;
18481
18482 SelectionDAG &DAG = DCI.DAG;
18483 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
18484
18485 EVT VT = N->getValueType(0);
18486 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
18487 bool IsStrict = N0->isStrictFPOpcode();
18488
18489 // extract(dup x) -> x
18490 if (N0.getOpcode() == AArch64ISD::DUP)
18491 return VT.isInteger() ? DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT)
18492 : N0.getOperand(0);
18493
18494 // Rewrite for pairwise fadd pattern
18495 // (f32 (extract_vector_elt
18496 // (fadd (vXf32 Other)
18497 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
18498 // ->
18499 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
18500 // (extract_vector_elt (vXf32 Other) 1))
18501 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
18502 // we can only do this when it's used only by the extract_vector_elt.
18503 if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
18504 (!IsStrict || N0.hasOneUse())) {
18505 SDLoc DL(N0);
18506 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
18507 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
18508
18509 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
18510 SDValue Other = N00;
18511
18512 // And handle the commutative case.
18513 if (!Shuffle) {
18514 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
18515 Other = N01;
18516 }
18517
18518 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
18519 Other == Shuffle->getOperand(0)) {
18520 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
18521 DAG.getConstant(0, DL, MVT::i64));
18522 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
18523 DAG.getConstant(1, DL, MVT::i64));
18524 if (!IsStrict)
18525 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
18526
18527 // For strict_fadd we need uses of the final extract_vector to be replaced
18528 // with the strict_fadd, but we also need uses of the chain output of the
18529 // original strict_fadd to use the chain output of the new strict_fadd as
18530 // otherwise it may not be deleted.
18531 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
18532 {VT, MVT::Other},
18533 {N0->getOperand(0), Extract1, Extract2});
18534 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
18535 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
18536 return SDValue(N, 0);
18537 }
18538 }
18539
18540 return SDValue();
18541}
18542
18543 static SDValue performConcatVectorsCombine(SDNode *N,
18544 TargetLowering::DAGCombinerInfo &DCI,
18545 SelectionDAG &DAG) {
18546 SDLoc dl(N);
18547 EVT VT = N->getValueType(0);
18548 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
18549 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
18550
18551 if (VT.isScalableVector())
18552 return SDValue();
18553
18554 // Optimize concat_vectors of truncated vectors, where the intermediate
18555 // type is illegal, to avoid said illegality, e.g.,
18556 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
18557 // (v2i16 (truncate (v2i64)))))
18558 // ->
18559 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
18560 // (v4i32 (bitcast (v2i64))),
18561 // <0, 2, 4, 6>)))
18562 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
18563 // on both input and result type, so we might generate worse code.
18564 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
18565 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
18566 N1Opc == ISD::TRUNCATE) {
18567 SDValue N00 = N0->getOperand(0);
18568 SDValue N10 = N1->getOperand(0);
18569 EVT N00VT = N00.getValueType();
18570
18571 if (N00VT == N10.getValueType() &&
18572 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
18573 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
18574 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
18575 SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
18576 for (size_t i = 0; i < Mask.size(); ++i)
18577 Mask[i] = i * 2;
18578 return DAG.getNode(ISD::TRUNCATE, dl, VT,
18579 DAG.getVectorShuffle(
18580 MidVT, dl,
18581 DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
18582 DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
18583 }
18584 }
18585
18586 if (N->getOperand(0).getValueType() == MVT::v4i8) {
18587 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
18588 // loads to prevent having to go through the v4i8 load legalization that
18589 // needs to extend each element into a larger type.
18590 if (N->getNumOperands() % 2 == 0 && all_of(N->op_values(), [](SDValue V) {
18591 if (V.getValueType() != MVT::v4i8)
18592 return false;
18593 if (V.isUndef())
18594 return true;
18595 LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
18596 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
18597 LD->getExtensionType() == ISD::NON_EXTLOAD;
18598 })) {
18599 EVT NVT =
18600 EVT::getVectorVT(*DAG.getContext(), MVT::f32, N->getNumOperands());
18601 SmallVector<SDValue> Ops;
18602
18603 for (unsigned i = 0; i < N->getNumOperands(); i++) {
18604 SDValue V = N->getOperand(i);
18605 if (V.isUndef())
18606 Ops.push_back(DAG.getUNDEF(MVT::f32));
18607 else {
18608 LoadSDNode *LD = cast<LoadSDNode>(V);
18609 SDValue NewLoad =
18610 DAG.getLoad(MVT::f32, dl, LD->getChain(), LD->getBasePtr(),
18611 LD->getMemOperand());
18612 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
18613 Ops.push_back(NewLoad);
18614 }
18615 }
18616 return DAG.getBitcast(N->getValueType(0),
18617 DAG.getBuildVector(NVT, dl, Ops));
18618 }
18619 }
18620
18621 // Canonicalise concat_vectors to replace concatenations of truncated nots
18622 // with nots of concatenated truncates. This in some cases allows for multiple
18623 // redundant negations to be eliminated.
18624 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
18625 // (v4i16 (truncate (not (v4i32)))))
18626 // ->
18627 // (not (concat_vectors (v4i16 (truncate (v4i32))),
18628 // (v4i16 (truncate (v4i32)))))
18629 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
18630 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
18631 N->isOnlyUserOf(N1.getNode())) {
18632 auto isBitwiseVectorNegate = [](SDValue V) {
18633 return V->getOpcode() == ISD::XOR &&
18634 ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
18635 };
18636 SDValue N00 = N0->getOperand(0);
18637 SDValue N10 = N1->getOperand(0);
18638 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
18639 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
18640 return DAG.getNOT(
18641 dl,
18642 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
18643 DAG.getNode(ISD::TRUNCATE, dl, N0.getValueType(),
18644 N00->getOperand(0)),
18645 DAG.getNode(ISD::TRUNCATE, dl, N1.getValueType(),
18646 N10->getOperand(0))),
18647 VT);
18648 }
18649 }
18650
18651 // Wait till after everything is legalized to try this. That way we have
18652 // legal vector types and such.
18653 if (DCI.isBeforeLegalizeOps())
18654 return SDValue();
18655
18656 // Optimise concat_vectors of two [us]avgceils or [us]avgfloors with a 128-bit
18657 // destination size, combining into an avg of two concats of the source
18658 // vectors. eg: concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c),
18659 // concat(b, d))
18660 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
18661 (N0Opc == ISD::AVGCEILU || N0Opc == ISD::AVGCEILS ||
18662 N0Opc == ISD::AVGFLOORU || N0Opc == ISD::AVGFLOORS) &&
18663 N0->hasOneUse() && N1->hasOneUse()) {
18664 SDValue N00 = N0->getOperand(0);
18665 SDValue N01 = N0->getOperand(1);
18666 SDValue N10 = N1->getOperand(0);
18667 SDValue N11 = N1->getOperand(1);
18668
18669 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
18670 SDValue Concat0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N00, N10);
18671 SDValue Concat1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N01, N11);
18672 return DAG.getNode(N0Opc, dl, VT, Concat0, Concat1);
18673 }
18674 }
18675
18676 auto IsRSHRN = [](SDValue Shr) {
18677 if (Shr.getOpcode() != AArch64ISD::VLSHR)
18678 return false;
18679 SDValue Op = Shr.getOperand(0);
18680 EVT VT = Op.getValueType();
18681 unsigned ShtAmt = Shr.getConstantOperandVal(1);
18682 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
18683 return false;
18684
18685 APInt Imm;
18686 if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift)
18687 Imm = APInt(VT.getScalarSizeInBits(),
18688 Op.getOperand(1).getConstantOperandVal(0)
18689 << Op.getOperand(1).getConstantOperandVal(1));
18690 else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP &&
18691 isa<ConstantSDNode>(Op.getOperand(1).getOperand(0)))
18692 Imm = APInt(VT.getScalarSizeInBits(),
18693 Op.getOperand(1).getConstantOperandVal(0));
18694 else
18695 return false;
18696
18697 if (Imm != 1ULL << (ShtAmt - 1))
18698 return false;
18699 return true;
18700 };
18701
18702 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
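// IsRSHRN above recognises a rounding narrowing shift lowered as
// vlshr(add(x, 1 << (shift - 1)), shift); the two halves are rebuilt as a
// single full-width add-and-shift so one rounding shift covers both inputs.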
18703 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
18704 ((IsRSHRN(N1) &&
18705 N0.getConstantOperandVal(1) == N1.getConstantOperandVal(1)) ||
18706 N1.isUndef())) {
18707 SDValue X = N0.getOperand(0).getOperand(0);
18708 SDValue Y = N1.isUndef() ? DAG.getUNDEF(X.getValueType())
18709 : N1.getOperand(0).getOperand(0);
18710 EVT BVT =
18711 X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
18712 SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, dl, BVT, X, Y);
18713 SDValue Add = DAG.getNode(
18714 ISD::ADD, dl, BVT, CC,
18715 DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), dl, BVT));
18716 SDValue Shr =
18717 DAG.getNode(AArch64ISD::VLSHR, dl, BVT, Add, N0.getOperand(1));
18718 return Shr;
18719 }
18720
18721 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
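// Concatenating both zip halves reproduces the full interleaving of a and b,
// which is exactly zip1 performed on the concat-widened operands.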
18722 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
18723 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
18724 N0.getOperand(1) == N1.getOperand(1)) {
18725 SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(0),
18726 DAG.getUNDEF(N0.getValueType()));
18727 SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(1),
18728 DAG.getUNDEF(N0.getValueType()));
18729 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, E0, E1);
18730 }
18731
18732 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
18733 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
18734 // canonicalise to that.
18735 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
18736 assert(VT.getScalarSizeInBits() == 64);
18737 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
18738 DAG.getConstant(0, dl, MVT::i64));
18739 }
18740
18741 // Canonicalise concat_vectors so that the right-hand vector has as few
18742 // bit-casts as possible before its real operation. The primary matching
18743 // destination for these operations will be the narrowing "2" instructions,
18744 // which depend on the operation being performed on this right-hand vector.
18745 // For example,
18746 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
18747 // becomes
18748 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
18749
18750 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
18751 return SDValue();
18752 SDValue RHS = N1->getOperand(0);
18753 MVT RHSTy = RHS.getValueType().getSimpleVT();
18754 // If the RHS is not a vector, this is not the pattern we're looking for.
18755 if (!RHSTy.isVector())
18756 return SDValue();
18757
18758 LLVM_DEBUG(
18759 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
18760
18761 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
18762 RHSTy.getVectorNumElements() * 2);
18763 return DAG.getNode(ISD::BITCAST, dl, VT,
18764 DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
18765 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
18766 RHS));
18767}
18768
18769static SDValue
18770 performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18771 SelectionDAG &DAG) {
18772 if (DCI.isBeforeLegalizeOps())
18773 return SDValue();
18774
18775 EVT VT = N->getValueType(0);
18776 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
18777 return SDValue();
18778
18779 SDValue V = N->getOperand(0);
18780
18781 // NOTE: This combine exists in DAGCombiner, but that version's legality check
18782 // blocks this combine because the non-const case requires custom lowering.
18783 //
18784 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
18785 if (V.getOpcode() == ISD::SPLAT_VECTOR)
18786 if (isa<ConstantSDNode>(V.getOperand(0)))
18787 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
18788
18789 return SDValue();
18790}
18791
18792static SDValue
18793 performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18794 SelectionDAG &DAG) {
18795 SDLoc DL(N);
18796 SDValue Vec = N->getOperand(0);
18797 SDValue SubVec = N->getOperand(1);
18798 uint64_t IdxVal = N->getConstantOperandVal(2);
18799 EVT VecVT = Vec.getValueType();
18800 EVT SubVT = SubVec.getValueType();
18801
18802 // Only do this for legal fixed vector types.
18803 if (!VecVT.isFixedLengthVector() ||
18804 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
18805 !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
18806 return SDValue();
18807
18808 // Ignore widening patterns.
18809 if (IdxVal == 0 && Vec.isUndef())
18810 return SDValue();
18811
18812 // Subvector must be half the width and an "aligned" insertion.
18813 unsigned NumSubElts = SubVT.getVectorNumElements();
18814 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
18815 (IdxVal != 0 && IdxVal != NumSubElts))
18816 return SDValue();
18817
18818 // Fold insert_subvector -> concat_vectors
18819 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
18820 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
18821 SDValue Lo, Hi;
18822 if (IdxVal == 0) {
18823 Lo = SubVec;
18824 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
18825 DAG.getVectorIdxConstant(NumSubElts, DL));
18826 } else {
18827 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
18828 DAG.getVectorIdxConstant(0, DL));
18829 Hi = SubVec;
18830 }
18831 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
18832}
18833
18834 static SDValue tryCombineFixedPointConvert(SDNode *N,
18835 TargetLowering::DAGCombinerInfo &DCI,
18836 SelectionDAG &DAG) {
18837 // Wait until after everything is legalized to try this. That way we have
18838 // legal vector types and such.
18839 if (DCI.isBeforeLegalizeOps())
18840 return SDValue();
18841 // Transform a scalar conversion of a value from a lane extract into a
18842 // lane extract of a vector conversion. E.g., from foo1 to foo2:
18843 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
18844 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
18845 //
18846 // The second form interacts better with instruction selection and the
18847 // register allocator to avoid cross-class register copies that aren't
18848 // coalescable due to a lane reference.
18849
18850 // Check the operand and see if it originates from a lane extract.
18851 SDValue Op1 = N->getOperand(1);
18852 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
18853 return SDValue();
18854
18855 // Yep, no additional predication needed. Perform the transform.
18856 SDValue IID = N->getOperand(0);
18857 SDValue Shift = N->getOperand(2);
18858 SDValue Vec = Op1.getOperand(0);
18859 SDValue Lane = Op1.getOperand(1);
18860 EVT ResTy = N->getValueType(0);
18861 EVT VecResTy;
18862 SDLoc DL(N);
18863
18864 // The vector width should be 128 bits by the time we get here, even
18865 // if it started as 64 bits (the extract_vector handling will have
18866 // done so). Bail if it is not.
18867 if (Vec.getValueSizeInBits() != 128)
18868 return SDValue();
18869
18870 if (Vec.getValueType() == MVT::v4i32)
18871 VecResTy = MVT::v4f32;
18872 else if (Vec.getValueType() == MVT::v2i64)
18873 VecResTy = MVT::v2f64;
18874 else
18875 return SDValue();
18876
18877 SDValue Convert =
18878 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
18879 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
18880}
18881
18882// AArch64 high-vector "long" operations are formed by performing the non-high
18883// version on an extract_subvector of each operand which gets the high half:
18884//
18885// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
18886//
18887// However, there are cases which don't have an extract_high explicitly, but
18888// have another operation that can be made compatible with one for free. For
18889// example:
18890//
18891// (dupv64 scalar) --> (extract_high (dup128 scalar))
18892//
18893// This routine does the actual conversion of such DUPs, once outer routines
18894// have determined that everything else is in order.
18895// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
18896// similarly here.
18897 static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
18898 MVT VT = N.getSimpleValueType();
18899 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
18900 N.getConstantOperandVal(1) == 0)
18901 N = N.getOperand(0);
18902
18903 switch (N.getOpcode()) {
18904 case AArch64ISD::DUP:
18905 case AArch64ISD::DUPLANE8:
18906 case AArch64ISD::DUPLANE16:
18907 case AArch64ISD::DUPLANE32:
18908 case AArch64ISD::DUPLANE64:
18909 case AArch64ISD::MOVI:
18910 case AArch64ISD::MOVIshift:
18911 case AArch64ISD::MOVIedit:
18912 case AArch64ISD::MOVImsl:
18913 case AArch64ISD::MVNIshift:
18914 case AArch64ISD::MVNImsl:
18915 break;
18916 default:
18917 // FMOV could be supported, but isn't very useful, as it would only occur
18918 // if you passed a bitcast' floating point immediate to an eligible long
18919 // integer op (addl, smull, ...).
18920 return SDValue();
18921 }
18922
18923 if (!VT.is64BitVector())
18924 return SDValue();
18925
18926 SDLoc DL(N);
18927 unsigned NumElems = VT.getVectorNumElements();
18928 if (N.getValueType().is64BitVector()) {
18929 MVT ElementTy = VT.getVectorElementType();
18930 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
18931 N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
18932 }
18933
18934 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
18935 DAG.getConstant(NumElems, DL, MVT::i64));
18936}
18937
18938 static bool isEssentiallyExtractHighSubvector(SDValue N) {
18939 if (N.getOpcode() == ISD::BITCAST)
18940 N = N.getOperand(0);
18941 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
18942 return false;
18943 if (N.getOperand(0).getValueType().isScalableVector())
18944 return false;
18945 return N.getConstantOperandAPInt(1) ==
18946 N.getOperand(0).getValueType().getVectorNumElements() / 2;
18947}
18948
18949/// Helper structure to keep track of ISD::SET_CC operands.
18954};
18955
18956/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
18957 struct AArch64SetCCInfo {
18958 const SDValue *Cmp;
18959 AArch64CC::CondCode CC;
18960 };
18961
18962/// Helper structure to keep track of SetCC information.
18966};
18967
18968 /// Helper structure to be able to read SetCC information. If the IsAArch64
18969 /// field is set to true, Info is an AArch64SetCCInfo; otherwise Info is a
18970 /// GenericSetCCInfo.
18971 struct SetCCInfoAndKind {
18972 SetCCInfo Info;
18973 bool IsAArch64;
18974 };
18975
18976/// Check whether or not \p Op is a SET_CC operation, either a generic or
18977/// an
18978/// AArch64 lowered one.
18979/// \p SetCCInfo is filled accordingly.
18980 /// \post SetCCInfo is meaningful only when this function returns true.
18981/// \return True when Op is a kind of SET_CC operation.
18982 static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
18983 // If this is a setcc, this is straightforward.
18984 if (Op.getOpcode() == ISD::SETCC) {
18985 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
18986 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
18987 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
18988 SetCCInfo.IsAArch64 = false;
18989 return true;
18990 }
18991 // Otherwise, check if this is a matching csel instruction.
18992 // In other words:
18993 // - csel 1, 0, cc
18994 // - csel 0, 1, !cc
18995 if (Op.getOpcode() != AArch64ISD::CSEL)
18996 return false;
18997 // Set the information about the operands.
18998 // TODO: we want the operands of the Cmp not the csel
18999 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
19000 SetCCInfo.IsAArch64 = true;
19001 SetCCInfo.Info.AArch64.CC =
19002 static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
19003
19004 // Check that the operands matches the constraints:
19005 // (1) Both operands must be constants.
19006 // (2) One must be 1 and the other must be 0.
19007 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
19008 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
19009
19010 // Check (1).
19011 if (!TValue || !FValue)
19012 return false;
19013
19014 // Check (2).
19015 if (!TValue->isOne()) {
19016 // Update the comparison when we are interested in !cc.
19017 std::swap(TValue, FValue);
19018 SetCCInfo.Info.AArch64.CC =
19019 AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
19020 }
19021 return TValue->isOne() && FValue->isZero();
19022}
19023
19024// Returns true if Op is setcc or zext of setcc.
19025static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
19026 if (isSetCC(Op, Info))
19027 return true;
19028 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
19029 isSetCC(Op->getOperand(0), Info));
19030}
19031
19032// The folding we want to perform is:
19033// (add x, [zext] (setcc cc ...) )
19034// -->
19035// (csel x, (add x, 1), !cc ...)
19036//
19037// The latter will get matched to a CSINC instruction.
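// For example, "w0 + (w1 == w2)" can be selected as "cmp w1, w2; cinc w0, w0, eq"
// rather than materialising the boolean in a register and adding it.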
19038 static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
19039 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
19040 SDValue LHS = Op->getOperand(0);
19041 SDValue RHS = Op->getOperand(1);
19042 SetCCInfoAndKind InfoAndKind;
19043
19044 // If both operands are a SET_CC, then we don't want to perform this
19045 // folding and create another csel as this results in more instructions
19046 // (and higher register usage).
19047 if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
19048 isSetCCOrZExtSetCC(RHS, InfoAndKind))
19049 return SDValue();
19050
19051 // If neither operand is a SET_CC, give up.
19052 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
19053 std::swap(LHS, RHS);
19054 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
19055 return SDValue();
19056 }
19057
19058 // FIXME: This could be generalized to work for FP comparisons.
19059 EVT CmpVT = InfoAndKind.IsAArch64
19060 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
19061 : InfoAndKind.Info.Generic.Opnd0->getValueType();
19062 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
19063 return SDValue();
19064
19065 SDValue CCVal;
19066 SDValue Cmp;
19067 SDLoc dl(Op);
19068 if (InfoAndKind.IsAArch64) {
19069 CCVal = DAG.getConstant(
19070 AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
19071 MVT::i32);
19072 Cmp = *InfoAndKind.Info.AArch64.Cmp;
19073 } else
19074 Cmp = getAArch64Cmp(
19075 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
19076 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
19077 dl);
19078
19079 EVT VT = Op->getValueType(0);
19080 LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
19081 return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
19082}
19083
19084// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
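// Both UADDV results live in lane 0 of their vectors, so the scalar add of two
// reductions can be done as one vector add followed by a single reduction.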
19085 static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
19086 EVT VT = N->getValueType(0);
19087 // Only scalar integer and vector types.
19088 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
19089 return SDValue();
19090
19091 SDValue LHS = N->getOperand(0);
19092 SDValue RHS = N->getOperand(1);
19093 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19094 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
19095 return SDValue();
19096
19097 auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
19098 auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
19099 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
19100 return SDValue();
19101
19102 SDValue Op1 = LHS->getOperand(0);
19103 SDValue Op2 = RHS->getOperand(0);
19104 EVT OpVT1 = Op1.getValueType();
19105 EVT OpVT2 = Op2.getValueType();
19106 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
19107 Op2.getOpcode() != AArch64ISD::UADDV ||
19108 OpVT1.getVectorElementType() != VT)
19109 return SDValue();
19110
19111 SDValue Val1 = Op1.getOperand(0);
19112 SDValue Val2 = Op2.getOperand(0);
19113 EVT ValVT = Val1->getValueType(0);
19114 SDLoc DL(N);
19115 SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
19116 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
19117 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
19118 DAG.getConstant(0, DL, MVT::i64));
19119}
19120
19121/// Perform the scalar expression combine in the form of:
19122/// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
19123/// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
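/// For example, "add(csel(c, 1, cc), b)" yields b+c when cc holds and b+1
/// otherwise, which is exactly CSINC(b+c, b, cc).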
19124 static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
19125 EVT VT = N->getValueType(0);
19126 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
19127 return SDValue();
19128
19129 SDValue LHS = N->getOperand(0);
19130 SDValue RHS = N->getOperand(1);
19131
19132 // Handle commutativity.
19133 if (LHS.getOpcode() != AArch64ISD::CSEL &&
19134 LHS.getOpcode() != AArch64ISD::CSNEG) {
19135 std::swap(LHS, RHS);
19136 if (LHS.getOpcode() != AArch64ISD::CSEL &&
19137 LHS.getOpcode() != AArch64ISD::CSNEG) {
19138 return SDValue();
19139 }
19140 }
19141
19142 if (!LHS.hasOneUse())
19143 return SDValue();
19144
19145 AArch64CC::CondCode AArch64CC =
19146 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
19147
19148 // The CSEL should include a constant one operand, and the CSNEG should
19149 // include a one or negative-one operand.
19150 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
19151 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
19152 if (!CTVal || !CFVal)
19153 return SDValue();
19154
19155 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
19156 (CTVal->isOne() || CFVal->isOne())) &&
19157 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
19158 (CTVal->isOne() || CFVal->isAllOnes())))
19159 return SDValue();
19160
19161 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
19162 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
19163 !CFVal->isOne()) {
19164 std::swap(CTVal, CFVal);
19165 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
19166 }
19167
19168 SDLoc DL(N);
19169 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
19170 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
19171 !CFVal->isAllOnes()) {
19172 APInt C = -1 * CFVal->getAPIntValue();
19173 CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
19174 CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
19175 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
19176 }
19177
19178 // It might be neutral for larger constants, as the immediate needs to be
19179 // materialized in a register.
19180 APInt ADDC = CTVal->getAPIntValue();
19181 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19182 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
19183 return SDValue();
19184
19185 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
19186 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
19187 "Unexpected constant value");
19188
19189 SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
19190 SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32);
19191 SDValue Cmp = LHS.getOperand(3);
19192
19193 return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
19194}
19195
19196// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
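// UDOT/SDOT accumulate into their first operand, so when the existing
// accumulator is all zeroes the addend A can take its place directly.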
19197 static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
19198 EVT VT = N->getValueType(0);
19199 if (N->getOpcode() != ISD::ADD)
19200 return SDValue();
19201
19202 SDValue Dot = N->getOperand(0);
19203 SDValue A = N->getOperand(1);
19204 // Handle commutativity
19205 auto isZeroDot = [](SDValue Dot) {
19206 return (Dot.getOpcode() == AArch64ISD::UDOT ||
19207 Dot.getOpcode() == AArch64ISD::SDOT) &&
19208 isZerosVector(Dot.getOperand(0).getNode());
19209 };
19210 if (!isZeroDot(Dot))
19211 std::swap(Dot, A);
19212 if (!isZeroDot(Dot))
19213 return SDValue();
19214
19215 return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
19216 Dot.getOperand(2));
19217}
19218
19219 static bool isNegatedInteger(SDValue Op) {
19220 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
19221}
19222
19223 static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
19224 SDLoc DL(Op);
19225 EVT VT = Op.getValueType();
19226 SDValue Zero = DAG.getConstant(0, DL, VT);
19227 return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
19228}
19229
19230// Try to fold
19231//
19232// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
19233//
19234// The folding helps csel to be matched with csneg without generating
19235// redundant neg instruction, which includes negation of the csel expansion
19236// of abs node lowered by lowerABS.
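// When one arm of the csel is itself a negation, the new double negation on
// that arm folds away, leaving a single negated arm that selects to csneg.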
19237 static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
19238 if (!isNegatedInteger(SDValue(N, 0)))
19239 return SDValue();
19240
19241 SDValue CSel = N->getOperand(1);
19242 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
19243 return SDValue();
19244
19245 SDValue N0 = CSel.getOperand(0);
19246 SDValue N1 = CSel.getOperand(1);
19247
19248 // If neither of them is a negation, the fold is not worthwhile, as it
19249 // introduces two additional negations while removing only one.
19250 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
19251 return SDValue();
19252
19253 SDValue N0N = getNegatedInteger(N0, DAG);
19254 SDValue N1N = getNegatedInteger(N1, DAG);
19255
19256 SDLoc DL(N);
19257 EVT VT = CSel.getValueType();
19258 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
19259 CSel.getOperand(3));
19260}
19261
19262// The basic add/sub long vector instructions have variants with "2" on the end
19263// which act on the high-half of their inputs. They are normally matched by
19264// patterns like:
19265//
19266// (add (zeroext (extract_high LHS)),
19267// (zeroext (extract_high RHS)))
19268// -> uaddl2 vD, vN, vM
19269//
19270// However, if one of the extracts is something like a duplicate, this
19271// instruction can still be used profitably. This function puts the DAG into a
19272// more appropriate form for those patterns to trigger.
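// For example, (add (zext (extract_high X)), (zext (dup64 s))) can still use
// uaddl2 once the dup is rewritten below as (extract_high (dup128 s)).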
19273 static SDValue performAddSubLongCombine(SDNode *N,
19274 TargetLowering::DAGCombinerInfo &DCI) {
19275 SelectionDAG &DAG = DCI.DAG;
19276 if (DCI.isBeforeLegalizeOps())
19277 return SDValue();
19278
19279 MVT VT = N->getSimpleValueType(0);
19280 if (!VT.is128BitVector()) {
19281 if (N->getOpcode() == ISD::ADD)
19282 return performSetccAddFolding(N, DAG);
19283 return SDValue();
19284 }
19285
19286 // Make sure both branches are extended in the same way.
19287 SDValue LHS = N->getOperand(0);
19288 SDValue RHS = N->getOperand(1);
19289 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
19290 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
19291 LHS.getOpcode() != RHS.getOpcode())
19292 return SDValue();
19293
19294 unsigned ExtType = LHS.getOpcode();
19295
19296 // It's not worth doing if at least one of the inputs isn't already an
19297 // extract, but we don't know which it'll be so we have to try both.
19298 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
19299 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
19300 if (!RHS.getNode())
19301 return SDValue();
19302
19303 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
19304 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
19305 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
19306 if (!LHS.getNode())
19307 return SDValue();
19308
19309 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
19310 }
19311
19312 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
19313}
19314
19315static bool isCMP(SDValue Op) {
19316 return Op.getOpcode() == AArch64ISD::SUBS &&
19317 !Op.getNode()->hasAnyUseOfValue(0);
19318}
19319
19320// (CSEL 1 0 CC Cond) => CC
19321// (CSEL 0 1 CC Cond) => !CC
19322static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
19323 if (Op.getOpcode() != AArch64ISD::CSEL)
19324 return std::nullopt;
19325 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
19326 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
19327 return std::nullopt;
19328 SDValue OpLHS = Op.getOperand(0);
19329 SDValue OpRHS = Op.getOperand(1);
19330 if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
19331 return CC;
19332 if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
19333 return getInvertedCondCode(CC);
19334
19335 return std::nullopt;
19336}
19337
19338// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
19339// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
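// The CSET copies the carry out of NZCV and the CMP moves it back in; when the
// pattern matches exactly, both round trips are dropped and the original flags
// value feeds the ADC/SBC directly.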
19340static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
19341 SDValue CmpOp = Op->getOperand(2);
19342 if (!isCMP(CmpOp))
19343 return SDValue();
19344
19345 if (IsAdd) {
19346 if (!isOneConstant(CmpOp.getOperand(1)))
19347 return SDValue();
19348 } else {
19349 if (!isNullConstant(CmpOp.getOperand(0)))
19350 return SDValue();
19351 }
19352
19353 SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
19354 auto CC = getCSETCondCode(CsetOp);
19355 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
19356 return SDValue();
19357
19358 return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
19359 Op->getOperand(0), Op->getOperand(1),
19360 CsetOp.getOperand(3));
19361}
19362
19363// (ADC x 0 cond) => (CINC x HS cond)
19364 static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
19365 SDValue LHS = N->getOperand(0);
19366 SDValue RHS = N->getOperand(1);
19367 SDValue Cond = N->getOperand(2);
19368
19369 if (!isNullConstant(RHS))
19370 return SDValue();
19371
19372 EVT VT = N->getValueType(0);
19373 SDLoc DL(N);
19374
19375 // (CINC x cc cond) <=> (CSINC x x !cc cond)
19376 SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32);
19377 return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
19378}
19379
19380// Transform vector add(zext i8 to i32, zext i8 to i32)
19381// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
19382// This allows extra uses of saddl/uaddl at the lower vector widths, and less
19383// extends.
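// The i16 add cannot overflow into the sign bit for zero-extended i8 inputs
// (and is exact for sign-extended ones), so sign-extending the narrow result
// preserves the original value.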
19384 static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG) {
19385 EVT VT = N->getValueType(0);
19386 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
19387 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
19388 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
19389 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
19390 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
19391 N->getOperand(0).getOperand(0).getValueType() !=
19392 N->getOperand(1).getOperand(0).getValueType())
19393 return SDValue();
19394
19395 SDValue N0 = N->getOperand(0).getOperand(0);
19396 SDValue N1 = N->getOperand(1).getOperand(0);
19397 EVT InVT = N0.getValueType();
19398
19399 EVT S1 = InVT.getScalarType();
19400 EVT S2 = VT.getScalarType();
19401 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
19402 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
19403 SDLoc DL(N);
19404 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
19405 S2 == MVT::i32 ? MVT::i16 : MVT::i32,
19406 VT.getVectorElementCount());
19407 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
19408 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
19409 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
19410 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewOp);
19411 }
19412 return SDValue();
19413}
19414
19415 static SDValue performBuildVectorCombine(SDNode *N,
19416 TargetLowering::DAGCombinerInfo &DCI,
19417 SelectionDAG &DAG) {
19418 SDLoc DL(N);
19419 EVT VT = N->getValueType(0);
19420
19421 if (VT == MVT::v4f16 || VT == MVT::v4bf16) {
19422 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1),
19423 Elt2 = N->getOperand(2), Elt3 = N->getOperand(3);
19424 if (Elt0->getOpcode() == ISD::FP_ROUND &&
19425 Elt1->getOpcode() == ISD::FP_ROUND &&
19426 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
19427 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
19428 Elt0->getConstantOperandVal(1) == Elt1->getConstantOperandVal(1) &&
19429 Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19430 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19431 // Constant index.
19432 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
19433 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
19434 Elt0->getOperand(0)->getOperand(0) ==
19435 Elt1->getOperand(0)->getOperand(0) &&
19436 Elt0->getOperand(0)->getConstantOperandVal(1) == 0 &&
19437 Elt1->getOperand(0)->getConstantOperandVal(1) == 1) {
19438 SDValue LowLanesSrcVec = Elt0->getOperand(0)->getOperand(0);
19439 if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
19440 SDValue HighLanes;
19441 if (Elt2->getOpcode() == ISD::UNDEF &&
19442 Elt3->getOpcode() == ISD::UNDEF) {
19443 HighLanes = DAG.getUNDEF(MVT::v2f32);
19444 } else if (Elt2->getOpcode() == ISD::FP_ROUND &&
19445 Elt3->getOpcode() == ISD::FP_ROUND &&
19446 isa<ConstantSDNode>(Elt2->getOperand(1)) &&
19447 isa<ConstantSDNode>(Elt3->getOperand(1)) &&
19448 Elt2->getConstantOperandVal(1) ==
19449 Elt3->getConstantOperandVal(1) &&
19450 Elt2->getOperand(0)->getOpcode() ==
19451 ISD::EXTRACT_VECTOR_ELT &&
19452 Elt3->getOperand(0)->getOpcode() ==
19453 ISD::EXTRACT_VECTOR_ELT &&
19454 // Constant index.
19455 isa<ConstantSDNode>(Elt2->getOperand(0)->getOperand(1)) &&
19456 isa<ConstantSDNode>(Elt3->getOperand(0)->getOperand(1)) &&
19457 Elt2->getOperand(0)->getOperand(0) ==
19458 Elt3->getOperand(0)->getOperand(0) &&
19459 Elt2->getOperand(0)->getConstantOperandVal(1) == 0 &&
19460 Elt3->getOperand(0)->getConstantOperandVal(1) == 1) {
19461 SDValue HighLanesSrcVec = Elt2->getOperand(0)->getOperand(0);
19462 HighLanes =
19463 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, HighLanesSrcVec);
19464 }
19465 if (HighLanes) {
19466 SDValue DoubleToSingleSticky =
19467 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, LowLanesSrcVec);
19468 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
19469 DoubleToSingleSticky, HighLanes);
19470 return DAG.getNode(ISD::FP_ROUND, DL, VT, Concat,
19471 Elt0->getOperand(1));
19472 }
19473 }
19474 }
19475 }
19476
19477 if (VT == MVT::v2f64) {
19478 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
19479 if (Elt0->getOpcode() == ISD::FP_EXTEND &&
19480 Elt1->getOpcode() == ISD::FP_EXTEND &&
19481 Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19482 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19483 Elt0->getOperand(0)->getOperand(0) ==
19484 Elt1->getOperand(0)->getOperand(0) &&
19485 // Constant index.
19486 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
19487 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
19488 Elt0->getOperand(0)->getConstantOperandVal(1) + 1 ==
19489 Elt1->getOperand(0)->getConstantOperandVal(1) &&
19490 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
19491 // ResultType's known minimum vector length.
19492 Elt0->getOperand(0)->getConstantOperandVal(1) %
19493 VT.getVectorMinNumElements() ==
19494 0) {
19495 SDValue SrcVec = Elt0->getOperand(0)->getOperand(0);
19496 if (SrcVec.getValueType() == MVT::v4f16 ||
19497 SrcVec.getValueType() == MVT::v4bf16) {
19498 SDValue HalfToSingle =
19499 DAG.getNode(ISD::FP_EXTEND, DL, MVT::v4f32, SrcVec);
19500 SDValue SubvectorIdx = Elt0->getOperand(0)->getOperand(1);
19501 SDValue Extract = DAG.getNode(
19502 ISD::EXTRACT_SUBVECTOR, DL, VT.changeVectorElementType(MVT::f32),
19503 HalfToSingle, SubvectorIdx);
19504 return DAG.getNode(ISD::FP_EXTEND, DL, VT, Extract);
19505 }
19506 }
19507 }
19508
19509 // A build vector of two extracted elements is equivalent to an
19510 // extract subvector where the inner vector is any-extended to the
19511 // extract_vector_elt VT.
19512 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
19513 // (extract_elt_iXX_to_i32 vec Idx+1))
19514 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
19515
19516 // For now, only consider the v2i32 case, which arises as a result of
19517 // legalization.
19518 if (VT != MVT::v2i32)
19519 return SDValue();
19520
19521 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
19522 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
19523 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19524 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19525 // Constant index.
19526 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
19527 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
19528 // Both EXTRACT_VECTOR_ELT from same vector...
19529 Elt0->getOperand(0) == Elt1->getOperand(0) &&
19530 // ... and contiguous. First element's index +1 == second element's index.
19531 Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
19532 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
19533 // ResultType's known minimum vector length.
19534 Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
19535 SDValue VecToExtend = Elt0->getOperand(0);
19536 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
19537 if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
19538 return SDValue();
19539
19540 SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
19541
19542 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
19543 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
19544 SubvectorIdx);
19545 }
19546
19547 return SDValue();
19548}
19549
19550 static SDValue performTruncateCombine(SDNode *N,
19551 SelectionDAG &DAG) {
19552 EVT VT = N->getValueType(0);
19553 SDValue N0 = N->getOperand(0);
19554 if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
19555 N0.getOpcode() == AArch64ISD::DUP) {
19556 SDValue Op = N0.getOperand(0);
19557 if (VT.getScalarType() == MVT::i32 &&
19558 N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
19559 Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i32, Op);
19560 return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Op);
19561 }
19562
19563 return SDValue();
19564}
19565
19566 // Check whether a node is an extend or shift operand
19567 static bool isExtendOrShiftOperand(SDValue N) {
19568 unsigned Opcode = N.getOpcode();
19569 if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
19570 EVT SrcVT;
19571 if (Opcode == ISD::SIGN_EXTEND_INREG)
19572 SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
19573 else
19574 SrcVT = N.getOperand(0).getValueType();
19575
19576 return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
19577 } else if (Opcode == ISD::AND) {
19578 ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
19579 if (!CSD)
19580 return false;
19581 uint64_t AndMask = CSD->getZExtValue();
19582 return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
19583 } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
19584 return isa<ConstantSDNode>(N.getOperand(1));
19585 }
19586
19587 return false;
19588}
19589
19590// (N - Y) + Z --> (Z - Y) + N
19591// when N is an extend or shift operand
19592 static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z,
19593 SelectionDAG &DAG) {
19594 auto IsOneUseExtend = [](SDValue N) {
19595 return N.hasOneUse() && isExtendOrShiftOperand(N);
19596 };
19597
19598 // DAGCombiner will revert the combination when Z is constant, causing an
19599 // infinite loop, so don't enable the combination when Z is constant.
19600 // If Z is a one-use extend or shift, we also can't do the optimization, as
19601 // it would fall into the same infinite loop.
19602 if (isa<ConstantSDNode>(Z) || IsOneUseExtend(Z))
19603 return SDValue();
19604
19605 if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
19606 return SDValue();
19607
19608 SDValue Shift = SUB.getOperand(0);
19609 if (!IsOneUseExtend(Shift))
19610 return SDValue();
19611
19612 SDLoc DL(N);
19613 EVT VT = N->getValueType(0);
19614
19615 SDValue Y = SUB.getOperand(1);
19616 SDValue NewSub = DAG.getNode(ISD::SUB, DL, VT, Z, Y);
19617 return DAG.getNode(ISD::ADD, DL, VT, NewSub, Shift);
19618}
19619
19620 static SDValue performAddCombineForShiftedOperands(SDNode *N,
19621 SelectionDAG &DAG) {
19622 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
19623 // commutative.
19624 if (N->getOpcode() != ISD::ADD)
19625 return SDValue();
19626
19627 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
19628 // shifted register is only available for i32 and i64.
19629 EVT VT = N->getValueType(0);
19630 if (VT != MVT::i32 && VT != MVT::i64)
19631 return SDValue();
19632
19633 SDLoc DL(N);
19634 SDValue LHS = N->getOperand(0);
19635 SDValue RHS = N->getOperand(1);
19636
19637 if (SDValue Val = performAddCombineSubShift(N, LHS, RHS, DAG))
19638 return Val;
19639 if (SDValue Val = performAddCombineSubShift(N, RHS, LHS, DAG))
19640 return Val;
19641
19642 uint64_t LHSImm = 0, RHSImm = 0;
19643 // If both operands are shifted by imm and shift amount is not greater than 4
19644 // for one operand, swap LHS and RHS to put operand with smaller shift amount
19645 // on RHS.
19646 //
19647 // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
19648 // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
19649 // with LSL (shift > 4). For other processors, this is a no-op for
19650 // performance or correctness.
19651 if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
19652 isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
19653 RHSImm > 4 && LHS.hasOneUse())
19654 return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
19655
19656 return SDValue();
19657}
19658
19659// The mid end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2))
19660// This reassociates it back to allow the creation of more mls instructions.
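// For example, sub(x, add(mul(a, b), mul(c, d))) becomes
// sub(sub(x, mul(a, b)), mul(c, d)), letting both multiplies fold into mls.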
19661 static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG) {
19662 if (N->getOpcode() != ISD::SUB)
19663 return SDValue();
19664
19665 SDValue Add = N->getOperand(1);
19666 SDValue X = N->getOperand(0);
19667 if (Add.getOpcode() != ISD::ADD)
19668 return SDValue();
19669
19670 if (!Add.hasOneUse())
19671 return SDValue();
19672 if (DAG.isConstantIntBuildVectorOrConstantInt(X))
19673 return SDValue();
19674
19675 SDValue M1 = Add.getOperand(0);
19676 SDValue M2 = Add.getOperand(1);
19677 if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
19678 M1.getOpcode() != AArch64ISD::UMULL)
19679 return SDValue();
19680 if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
19681 M2.getOpcode() != AArch64ISD::UMULL)
19682 return SDValue();
19683
19684 EVT VT = N->getValueType(0);
19685 SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, X, M1);
19686 return DAG.getNode(ISD::SUB, SDLoc(N), VT, Sub, M2);
19687}
19688
19689// Combine into mla/mls.
19690// This works on the patterns of:
19691// add v1, (mul v2, v3)
19692// sub v1, (mul v2, v3)
19693// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
19694// It will transform the add/sub to a scalable version, so that we can
19695// make use of SVE's MLA/MLS that will be generated for that pattern
19696static SDValue
19697 performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
19698 SelectionDAG &DAG = DCI.DAG;
19699 // Make sure that the types are legal
19700 if (!DCI.isAfterLegalizeDAG())
19701 return SDValue();
19702 // Before using SVE's features, check first if it's available.
19703 if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
19704 return SDValue();
19705
19706 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
19707 return SDValue();
19708
19709 if (!N->getValueType(0).isFixedLengthVector())
19710 return SDValue();
19711
19712 auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
19713 if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
19714 return SDValue();
19715
19716 if (!cast<ConstantSDNode>(Op1->getOperand(1))->isZero())
19717 return SDValue();
19718
19719 SDValue MulValue = Op1->getOperand(0);
19720 if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
19721 return SDValue();
19722
19723 if (!Op1.hasOneUse() || !MulValue.hasOneUse())
19724 return SDValue();
19725
19726 EVT ScalableVT = MulValue.getValueType();
19727 if (!ScalableVT.isScalableVector())
19728 return SDValue();
19729
19730 SDValue ScaledOp = convertToScalableVector(DAG, ScalableVT, Op0);
19731 SDValue NewValue =
19732 DAG.getNode(N->getOpcode(), SDLoc(N), ScalableVT, {ScaledOp, MulValue});
19733 return convertFromScalableVector(DAG, N->getValueType(0), NewValue);
19734 };
19735
19736 if (SDValue res = performOpt(N->getOperand(0), N->getOperand(1)))
19737 return res;
19738 else if (N->getOpcode() == ISD::ADD)
19739 return performOpt(N->getOperand(1), N->getOperand(0));
19740
19741 return SDValue();
19742}
19743
19744 // Given an i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
19745// help, for example, to produce ssra from sshr+add.
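// For example, add(extract_elt(v1i64 sshr(X, n), 0), (load i64)) is rewritten
// as an extract of a v1i64 add, so the shift and add can select to ssra.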
19746 static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) {
19747 EVT VT = N->getValueType(0);
19748 if (VT != MVT::i64)
19749 return SDValue();
19750 SDValue Op0 = N->getOperand(0);
19751 SDValue Op1 = N->getOperand(1);
19752
19753 // At least one of the operands should be an extract, and the other should be
19754 // something that is easy to convert to v1i64 type (in this case a load).
19755 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
19756 Op0.getOpcode() != ISD::LOAD)
19757 return SDValue();
19758 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
19759 Op1.getOpcode() != ISD::LOAD)
19760 return SDValue();
19761
19762 SDLoc DL(N);
19763 if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19764 Op0.getOperand(0).getValueType() == MVT::v1i64) {
19765 Op0 = Op0.getOperand(0);
19766 Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
19767 } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19768 Op1.getOperand(0).getValueType() == MVT::v1i64) {
19769 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
19770 Op1 = Op1.getOperand(0);
19771 } else
19772 return SDValue();
19773
19774 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
19775 DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
19776 DAG.getConstant(0, DL, MVT::i64));
19777}
19778
19779 static bool isLoadOrMultipleLoads(SDValue B, SmallVector<LoadSDNode *> &Loads) {
19780 SDValue BV = peekThroughOneUseBitcasts(B);
19781 if (!BV->hasOneUse())
19782 return false;
19783 if (auto *Ld = dyn_cast<LoadSDNode>(BV)) {
19784 if (!Ld || !Ld->isSimple())
19785 return false;
19786 Loads.push_back(Ld);
19787 return true;
19788 } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
19790 for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
19791 auto *Ld = dyn_cast<LoadSDNode>(BV.getOperand(Op));
19792 if (!Ld || !Ld->isSimple() || !BV.getOperand(Op).hasOneUse())
19793 return false;
19794 Loads.push_back(Ld);
19795 }
19796 return true;
19797 } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
19798 // Try to find a tree of shuffles and concats from how IR shuffles of loads
19799 // are lowered. Note that this only comes up because we do not always visit
19800 // operands before uses. After that is fixed this can be removed and in the
19801 // meantime this is fairly specific to the lowering we expect from IR.
19802 // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
19803 // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
19804 // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
19805 // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
19806 // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
19807 // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
19808 // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
19809 // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
19810 // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
19811 if (B.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE ||
19812 B.getOperand(0).getOperand(0).getOpcode() != ISD::CONCAT_VECTORS ||
19813 B.getOperand(0).getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
19814 B.getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
19815 B.getOperand(1).getNumOperands() != 4)
19816 return false;
19817 auto SV1 = cast<ShuffleVectorSDNode>(B);
19818 auto SV2 = cast<ShuffleVectorSDNode>(B.getOperand(0));
19819 int NumElts = B.getValueType().getVectorNumElements();
19820 int NumSubElts = NumElts / 4;
19821 for (int I = 0; I < NumSubElts; I++) {
19822 // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
19823 if (SV1->getMaskElt(I) != I ||
19824 SV1->getMaskElt(I + NumSubElts) != I + NumSubElts ||
19825 SV1->getMaskElt(I + NumSubElts * 2) != I + NumSubElts * 2 ||
19826 SV1->getMaskElt(I + NumSubElts * 3) != I + NumElts)
19827 return false;
19828 // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
19829 if (SV2->getMaskElt(I) != I ||
19830 SV2->getMaskElt(I + NumSubElts) != I + NumSubElts ||
19831 SV2->getMaskElt(I + NumSubElts * 2) != I + NumElts)
19832 return false;
19833 }
19834 auto *Ld0 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(0));
19835 auto *Ld1 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(1));
19836 auto *Ld2 = dyn_cast<LoadSDNode>(SV2->getOperand(1).getOperand(0));
19837 auto *Ld3 = dyn_cast<LoadSDNode>(B.getOperand(1).getOperand(0));
19838 if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
19839 !Ld2->isSimple() || !Ld3->isSimple())
19840 return false;
19841 Loads.push_back(Ld0);
19842 Loads.push_back(Ld1);
19843 Loads.push_back(Ld2);
19844 Loads.push_back(Ld3);
19845 return true;
19846 }
19847 return false;
19848}
19849
19850static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1,
19851 SelectionDAG &DAG,
19852 unsigned &NumSubLoads) {
19853 if (!Op0.hasOneUse() || !Op1.hasOneUse())
19854 return false;
19855
19856 SmallVector<LoadSDNode *> Loads0, Loads1;
19857 if (isLoadOrMultipleLoads(Op0, Loads0) &&
19858 isLoadOrMultipleLoads(Op1, Loads1)) {
19859 if (NumSubLoads && Loads0.size() != NumSubLoads)
19860 return false;
19861 NumSubLoads = Loads0.size();
19862 return Loads0.size() == Loads1.size() &&
19863 all_of(zip(Loads0, Loads1), [&DAG](auto L) {
19864 unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
19865 return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
19866 DAG.areNonVolatileConsecutiveLoads(get<1>(L), get<0>(L),
19867 Size / 8, 1);
19868 });
19869 }
19870
19871 if (Op0.getOpcode() != Op1.getOpcode())
19872 return false;
19873
19874 switch (Op0.getOpcode()) {
19875 case ISD::ADD:
19876 case ISD::SUB:
19877 return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
19878 DAG, NumSubLoads) &&
19879 areLoadedOffsetButOtherwiseSame(Op0.getOperand(1), Op1.getOperand(1),
19880 DAG, NumSubLoads);
19881 case ISD::SIGN_EXTEND:
19882 case ISD::ANY_EXTEND:
19883 case ISD::ZERO_EXTEND:
19884 EVT XVT = Op0.getOperand(0).getValueType();
19885 if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
19886 XVT.getScalarSizeInBits() != 32)
19887 return false;
19888 return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
19889 DAG, NumSubLoads);
19890 }
19891 return false;
19892}
19893
19894// This method attempts to fold trees of add(ext(load p), shl(ext(load p+4))
19895// into a single load of twice the size, from which we extract the bottom part and top
19896// part so that the shl can use a shll2 instruction. The two loads in that
19897// example can also be larger trees of instructions, which are identical except
19898// for the leaves which are all loads offset from the LHS, including
19899// buildvectors of multiple loads. For example the RHS tree could be
19900// sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4)))
19901// Whilst it can be common for the larger loads to replace LDP instructions
19902// (which doesn't gain anything on its own), the larger loads can help create
19903// more efficient code, and in buildvectors prevent the need for ld1 lane
19904// inserts which can be slower than normal loads.
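// For illustration, a rough sketch of the rewrite: given two consecutive loads
//   L0 = load <8 x i8> from p and L1 = load <8 x i8> from p+8,
// the tree add(zext(L0), shl(zext(L1), C)) can be turned into a single
//   L = load <16 x i8> from p
// followed by add(zext(low half of L), shl(zext(high half of L), C)),
// where the high-half extend can be performed by ushll2.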
19905static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) {
19906 EVT VT = N->getValueType(0);
19907 if (!VT.isFixedLengthVector() ||
19908 (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
19909 VT.getScalarSizeInBits() != 64))
19910 return SDValue();
19911
19912 SDValue Other = N->getOperand(0);
19913 SDValue Shift = N->getOperand(1);
19914 if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
19915 std::swap(Shift, Other);
19916 APInt ShiftAmt;
19917 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
19918 !ISD::isConstantSplatVector(Shift.getOperand(1).getNode(), ShiftAmt))
19919 return SDValue();
19920
19921 if (!ISD::isExtOpcode(Shift.getOperand(0).getOpcode()) ||
19922 !ISD::isExtOpcode(Other.getOpcode()) ||
19923 Shift.getOperand(0).getOperand(0).getValueType() !=
19924 Other.getOperand(0).getValueType() ||
19925 !Other.hasOneUse() || !Shift.getOperand(0).hasOneUse())
19926 return SDValue();
19927
19928 SDValue Op0 = Other.getOperand(0);
19929 SDValue Op1 = Shift.getOperand(0).getOperand(0);
19930
19931 unsigned NumSubLoads = 0;
19932 if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
19933 return SDValue();
19934
19935 // Attempt to rule out some unprofitable cases using heuristics (some working
19936 // around suboptimal code generation), notably if the extend would not be able
19937 // to use ushll2 instructions because the types are not large enough. Otherwise
19938 // zips will need to be created, which can increase the instruction count.
19939 unsigned NumElts = Op0.getValueType().getVectorNumElements();
19940 unsigned NumSubElts = NumElts / NumSubLoads;
19941 if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
19942 (Other.getOpcode() != Shift.getOperand(0).getOpcode() &&
19943 Op0.getValueType().getSizeInBits() < 128 &&
19945 return SDValue();
19946
19947 // Recreate the tree with the new combined loads.
19948 std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
19949 [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
19950 EVT DVT =
19951 Op0.getValueType().getDoubleNumVectorElementsVT(*DAG.getContext());
19952
19953 SmallVector<LoadSDNode *> Loads0, Loads1;
19954 if (isLoadOrMultipleLoads(Op0, Loads0) &&
19955 isLoadOrMultipleLoads(Op1, Loads1)) {
19956 EVT LoadVT = EVT::getVectorVT(
19957 *DAG.getContext(), Op0.getValueType().getScalarType(),
19958 Op0.getValueType().getVectorNumElements() / Loads0.size());
19959 EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(*DAG.getContext());
19960
19961 SmallVector<SDValue> NewLoads;
19962 for (const auto &[L0, L1] : zip(Loads0, Loads1)) {
19963 SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(),
19964 L0->getBasePtr(), L0->getPointerInfo(),
19965 L0->getOriginalAlign());
19966 DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1));
19967 DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1));
19968 NewLoads.push_back(Load);
19969 }
19970 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op0), DVT, NewLoads);
19971 }
19972
19973 SmallVector<SDValue> Ops;
19974 for (const auto &[O0, O1] : zip(Op0->op_values(), Op1->op_values()))
19975 Ops.push_back(GenCombinedTree(O0, O1, DAG));
19976 return DAG.getNode(Op0.getOpcode(), SDLoc(Op0), DVT, Ops);
19977 };
19978 SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
19979
19980 SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
19981 int Hi = NumSubElts, Lo = 0;
19982 for (unsigned i = 0; i < NumSubLoads; i++) {
19983 for (unsigned j = 0; j < NumSubElts; j++) {
19984 LowMask[i * NumSubElts + j] = Lo++;
19985 HighMask[i * NumSubElts + j] = Hi++;
19986 }
19987 Lo += NumSubElts;
19988 Hi += NumSubElts;
19989 }
19990 SDLoc DL(N);
19991 SDValue Ext0, Ext1;
19992 // Extract the top and bottom lanes, then extend the result. Possibly extend
19993 // the result then extract the lanes if the two operands match as it produces
19994 // slightly smaller code.
19995 if (Other.getOpcode() != Shift.getOperand(0).getOpcode()) {
19996 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(),
19997 NewOp, DAG.getConstant(0, DL, MVT::i64));
19998 SDValue SubH =
19999 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp,
20000 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
20001 SDValue Extr0 =
20002 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, LowMask);
20003 SDValue Extr1 =
20004 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, HighMask);
20005 Ext0 = DAG.getNode(Other.getOpcode(), DL, VT, Extr0);
20006 Ext1 = DAG.getNode(Shift.getOperand(0).getOpcode(), DL, VT, Extr1);
20007 } else {
20008 EVT DVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
20009 SDValue Ext = DAG.getNode(Other.getOpcode(), DL, DVT, NewOp);
20010 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
20011 DAG.getConstant(0, DL, MVT::i64));
20012 SDValue SubH =
20013 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
20014 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
20015 Ext0 = DAG.getVectorShuffle(VT, DL, SubL, SubH, LowMask);
20016 Ext1 = DAG.getVectorShuffle(VT, DL, SubL, SubH, HighMask);
20017 }
20018 SDValue NShift =
20019 DAG.getNode(Shift.getOpcode(), DL, VT, Ext1, Shift.getOperand(1));
20020 return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift);
20021}
20022
20023static SDValue performAddSubCombine(SDNode *N,
20024 TargetLowering::DAGCombinerInfo &DCI) {
20025 // Try to change sum of two reductions.
20026 if (SDValue Val = performAddUADDVCombine(N, DCI.DAG))
20027 return Val;
20028 if (SDValue Val = performAddDotCombine(N, DCI.DAG))
20029 return Val;
20030 if (SDValue Val = performAddCSelIntoCSinc(N, DCI.DAG))
20031 return Val;
20032 if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
20033 return Val;
20035 return Val;
20037 return Val;
20038 if (SDValue Val = performSubAddMULCombine(N, DCI.DAG))
20039 return Val;
20040 if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
20041 return Val;
20042 if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG))
20043 return Val;
20044
20045 if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
20046 return Val;
20047
20048 return performAddSubLongCombine(N, DCI);
20049}
20050
20051// Massage DAGs which we can use the high-half "long" operations on into
20052// something isel will recognize better. E.g.
20053//
20054// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
20055// (aarch64_neon_umull (extract_high (v2i64 vec)))
20056// (extract_high (v2i64 (dup128 scalar)))))
20057//
20058static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
20059 TargetLowering::DAGCombinerInfo &DCI,
20060 SelectionDAG &DAG) {
20061 if (DCI.isBeforeLegalizeOps())
20062 return SDValue();
20063
20064 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
20065 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
20066 assert(LHS.getValueType().is64BitVector() &&
20067 RHS.getValueType().is64BitVector() &&
20068 "unexpected shape for long operation");
20069
20070 // Either node could be a DUP, but it's not worth doing both of them (you'd
20071 // just as well use the non-high version) so look for a corresponding extract
20072 // operation on the other "wing".
20073 if (isEssentiallyExtractHighSubvector(LHS)) {
20074 RHS = tryExtendDUPToExtractHigh(RHS, DAG);
20075 if (!RHS.getNode())
20076 return SDValue();
20077 } else if (isEssentiallyExtractHighSubvector(RHS)) {
20078 LHS = tryExtendDUPToExtractHigh(LHS, DAG);
20079 if (!LHS.getNode())
20080 return SDValue();
20081 } else
20082 return SDValue();
20083
20084 if (IID == Intrinsic::not_intrinsic)
20085 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
20086
20087 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
20088 N->getOperand(0), LHS, RHS);
20089}
20090
20091static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
20092 MVT ElemTy = N->getSimpleValueType(0).getScalarType();
20093 unsigned ElemBits = ElemTy.getSizeInBits();
20094
20095 int64_t ShiftAmount;
20096 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
20097 APInt SplatValue, SplatUndef;
20098 unsigned SplatBitSize;
20099 bool HasAnyUndefs;
20100 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
20101 HasAnyUndefs, ElemBits) ||
20102 SplatBitSize != ElemBits)
20103 return SDValue();
20104
20105 ShiftAmount = SplatValue.getSExtValue();
20106 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
20107 ShiftAmount = CVN->getSExtValue();
20108 } else
20109 return SDValue();
20110
20111 // If the shift amount is zero, remove the shift intrinsic.
20112 if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
20113 return N->getOperand(1);
20114
20115 unsigned Opcode;
20116 bool IsRightShift;
20117 switch (IID) {
20118 default:
20119 llvm_unreachable("Unknown shift intrinsic");
20120 case Intrinsic::aarch64_neon_sqshl:
20121 Opcode = AArch64ISD::SQSHL_I;
20122 IsRightShift = false;
20123 break;
20124 case Intrinsic::aarch64_neon_uqshl:
20125 Opcode = AArch64ISD::UQSHL_I;
20126 IsRightShift = false;
20127 break;
20128 case Intrinsic::aarch64_neon_srshl:
20129 Opcode = AArch64ISD::SRSHR_I;
20130 IsRightShift = true;
20131 break;
20132 case Intrinsic::aarch64_neon_urshl:
20133 Opcode = AArch64ISD::URSHR_I;
20134 IsRightShift = true;
20135 break;
20136 case Intrinsic::aarch64_neon_sqshlu:
20137 Opcode = AArch64ISD::SQSHLU_I;
20138 IsRightShift = false;
20139 break;
20140 case Intrinsic::aarch64_neon_sshl:
20141 case Intrinsic::aarch64_neon_ushl:
20142 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
20143 // left shift for positive shift amounts. For negative shifts we can use a
20144 // VASHR/VLSHR as appropriate.
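    // For example, roughly: ushl(x, splat(-3)) becomes VLSHR(x, 3), while
    // sshl(x, splat(2)) becomes VSHL(x, 2).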
20145 if (ShiftAmount < 0) {
20146 Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
20147 : AArch64ISD::VLSHR;
20148 ShiftAmount = -ShiftAmount;
20149 } else
20150 Opcode = AArch64ISD::VSHL;
20151 IsRightShift = false;
20152 break;
20153 }
20154
20155 EVT VT = N->getValueType(0);
20156 SDValue Op = N->getOperand(1);
20157 SDLoc dl(N);
20158 if (VT == MVT::i64) {
20159 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op);
20160 VT = MVT::v1i64;
20161 }
20162
20163 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
20164 Op = DAG.getNode(Opcode, dl, VT, Op,
20165 DAG.getConstant(-ShiftAmount, dl, MVT::i32));
20166 if (N->getValueType(0) == MVT::i64)
20167 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
20168 DAG.getConstant(0, dl, MVT::i64));
20169 return Op;
20170 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
20171 Op = DAG.getNode(Opcode, dl, VT, Op,
20172 DAG.getConstant(ShiftAmount, dl, MVT::i32));
20173 if (N->getValueType(0) == MVT::i64)
20174 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
20175 DAG.getConstant(0, dl, MVT::i64));
20176 return Op;
20177 }
20178
20179 return SDValue();
20180}
20181
20182// The CRC32[BH] instructions ignore the high bits of their data operand. Since
20183// the intrinsics must be legal and take an i32, this means there's almost
20184// certainly going to be a zext in the DAG which we can eliminate.
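// For example, roughly: crc32b(crc, and(x, 0xff)) can be folded to
// crc32b(crc, x), since only the low byte of the data operand is read.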
20185static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
20186 SDValue AndN = N->getOperand(2);
20187 if (AndN.getOpcode() != ISD::AND)
20188 return SDValue();
20189
20190 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
20191 if (!CMask || CMask->getZExtValue() != Mask)
20192 return SDValue();
20193
20194 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
20195 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
20196}
20197
20199 SelectionDAG &DAG) {
20200 SDLoc dl(N);
20201 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
20202 DAG.getNode(Opc, dl,
20203 N->getOperand(1).getSimpleValueType(),
20204 N->getOperand(1)),
20205 DAG.getConstant(0, dl, MVT::i64));
20206}
20207
20208static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
20209 SDLoc DL(N);
20210 SDValue Op1 = N->getOperand(1);
20211 SDValue Op2 = N->getOperand(2);
20212 EVT ScalarTy = Op2.getValueType();
20213 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
20214 ScalarTy = MVT::i32;
20215
20216 // Lower index_vector(base, step) to mul(step, step_vector(1)) + splat(base).
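  // For example, roughly: index(2, 3) on nxv4i32 becomes
  //   add(mul(step_vector <0,1,2,3,...>, splat(3)), splat(2)) = <2,5,8,11,...>.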
20217 SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
20218 SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
20219 SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
20220 SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
20221 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
20222}
20223
20224static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
20225 SDLoc dl(N);
20226 SDValue Scalar = N->getOperand(3);
20227 EVT ScalarTy = Scalar.getValueType();
20228
20229 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
20230 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
20231
20232 SDValue Passthru = N->getOperand(1);
20233 SDValue Pred = N->getOperand(2);
20234 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
20235 Pred, Scalar, Passthru);
20236}
20237
20238static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
20239 SDLoc dl(N);
20240 LLVMContext &Ctx = *DAG.getContext();
20241 EVT VT = N->getValueType(0);
20242
20243 assert(VT.isScalableVector() && "Expected a scalable vector.");
20244
20245 // Current lowering only supports the SVE-ACLE types.
20247 return SDValue();
20248
20249 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
20250 unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
20251 EVT ByteVT =
20252 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
20253
20254 // Convert everything to the domain of EXT (i.e. bytes).
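  // For example, roughly: an EXT of two nxv4i32 vectors at element index 1
  // becomes a byte-wise EXT at offset 1 * 4 = 4 once the operands have been
  // bitcast below.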
20255 SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
20256 SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
20257 SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
20258 DAG.getConstant(ElemSize, dl, MVT::i32));
20259
20260 SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
20261 return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
20262}
20263
20264static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
20265 TargetLowering::DAGCombinerInfo &DCI,
20266 SelectionDAG &DAG) {
20267 if (DCI.isBeforeLegalize())
20268 return SDValue();
20269
20270 SDValue Comparator = N->getOperand(3);
20271 if (Comparator.getOpcode() == AArch64ISD::DUP ||
20272 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
20273 unsigned IID = getIntrinsicID(N);
20274 EVT VT = N->getValueType(0);
20275 EVT CmpVT = N->getOperand(2).getValueType();
20276 SDValue Pred = N->getOperand(1);
20277 SDValue Imm;
20278 SDLoc DL(N);
20279
20280 switch (IID) {
20281 default:
20282 llvm_unreachable("Called with wrong intrinsic!");
20283 break;
20284
20285 // Signed comparisons
20286 case Intrinsic::aarch64_sve_cmpeq_wide:
20287 case Intrinsic::aarch64_sve_cmpne_wide:
20288 case Intrinsic::aarch64_sve_cmpge_wide:
20289 case Intrinsic::aarch64_sve_cmpgt_wide:
20290 case Intrinsic::aarch64_sve_cmplt_wide:
20291 case Intrinsic::aarch64_sve_cmple_wide: {
20292 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
20293 int64_t ImmVal = CN->getSExtValue();
20294 if (ImmVal >= -16 && ImmVal <= 15)
20295 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
20296 else
20297 return SDValue();
20298 }
20299 break;
20300 }
20301 // Unsigned comparisons
20302 case Intrinsic::aarch64_sve_cmphs_wide:
20303 case Intrinsic::aarch64_sve_cmphi_wide:
20304 case Intrinsic::aarch64_sve_cmplo_wide:
20305 case Intrinsic::aarch64_sve_cmpls_wide: {
20306 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
20307 uint64_t ImmVal = CN->getZExtValue();
20308 if (ImmVal <= 127)
20309 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
20310 else
20311 return SDValue();
20312 }
20313 break;
20314 }
20315 }
20316
20317 if (!Imm)
20318 return SDValue();
20319
20320 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
20321 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
20322 N->getOperand(2), Splat, DAG.getCondCode(CC));
20323 }
20324
20325 return SDValue();
20326}
20327
20328static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
20329 AArch64CC::CondCode Cond) {
20330 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20331
20332 SDLoc DL(Op);
20333 assert(Op.getValueType().isScalableVector() &&
20334 TLI.isTypeLegal(Op.getValueType()) &&
20335 "Expected legal scalable vector type!");
20336 assert(Op.getValueType() == Pg.getValueType() &&
20337 "Expected same type for PTEST operands");
20338
20339 // Ensure target specific opcodes are using legal type.
20340 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
20341 SDValue TVal = DAG.getConstant(1, DL, OutVT);
20342 SDValue FVal = DAG.getConstant(0, DL, OutVT);
20343
20344 // Ensure operands have type nxv16i1.
20345 if (Op.getValueType() != MVT::nxv16i1) {
20348 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
20349 else
20350 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
20351 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
20352 }
20353
20354 // Set condition code (CC) flags.
20355 SDValue Test = DAG.getNode(
20357 DL, MVT::Other, Pg, Op);
20358
20359 // Convert CC to integer based on requested condition.
20360 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
20361 SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
20362 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
20363 return DAG.getZExtOrTrunc(Res, DL, VT);
20364}
20365
20366static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc,
20367 SelectionDAG &DAG) {
20368 SDLoc DL(N);
20369
20370 SDValue Pred = N->getOperand(1);
20371 SDValue VecToReduce = N->getOperand(2);
20372
20373 // NOTE: The integer reduction's result type is not always linked to the
20374 // operand's element type so we construct it from the intrinsic's result type.
20375 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
20376 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
20377
20378 // SVE reductions set the whole vector register with the first element
20379 // containing the reduction result, which we'll now extract.
20380 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20381 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
20382 Zero);
20383}
20384
20385static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
20386 SelectionDAG &DAG) {
20387 SDLoc DL(N);
20388
20389 SDValue Pred = N->getOperand(1);
20390 SDValue VecToReduce = N->getOperand(2);
20391
20392 EVT ReduceVT = VecToReduce.getValueType();
20393 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
20394
20395 // SVE reductions set the whole vector register with the first element
20396 // containing the reduction result, which we'll now extract.
20397 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20398 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
20399 Zero);
20400}
20401
20402static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
20403 SelectionDAG &DAG) {
20404 SDLoc DL(N);
20405
20406 SDValue Pred = N->getOperand(1);
20407 SDValue InitVal = N->getOperand(2);
20408 SDValue VecToReduce = N->getOperand(3);
20409 EVT ReduceVT = VecToReduce.getValueType();
20410
20411 // Ordered reductions use the first lane of the result vector as the
20412 // reduction's initial value.
20413 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20414 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
20415 DAG.getUNDEF(ReduceVT), InitVal, Zero);
20416
20417 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
20418
20419 // SVE reductions set the whole vector register with the first element
20420 // containing the reduction result, which we'll now extract.
20421 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
20422 Zero);
20423}
20424
20425// If a merged operation has no inactive lanes we can relax it to a predicated
20426// or unpredicated operation, which potentially allows better isel (perhaps
20427// using immediate forms) or relaxing register reuse requirements.
20428static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
20427// using immediate forms) or relaxing register reuse requirements.
20429 SelectionDAG &DAG, bool UnpredOp = false,
20430 bool SwapOperands = false) {
20431 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
20432 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
20433 SDValue Pg = N->getOperand(1);
20434 SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
20435 SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
20436
20437 // ISD way to specify an all active predicate.
20438 if (isAllActivePredicate(DAG, Pg)) {
20439 if (UnpredOp)
20440 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
20441
20442 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
20443 }
20444
20445 // FUTURE: SplatVector(true)
20446 return SDValue();
20447}
20448
20449static SDValue performIntrinsicCombine(SDNode *N,
20450 TargetLowering::DAGCombinerInfo &DCI,
20451 const AArch64Subtarget *Subtarget) {
20452 SelectionDAG &DAG = DCI.DAG;
20453 unsigned IID = getIntrinsicID(N);
20454 switch (IID) {
20455 default:
20456 break;
20457 case Intrinsic::get_active_lane_mask: {
20458 SDValue Res = SDValue();
20459 EVT VT = N->getValueType(0);
20460 if (VT.isFixedLengthVector()) {
20461 // We can use the SVE whilelo instruction to lower this intrinsic by
20462 // creating the appropriate sequence of scalable vector operations and
20463 // then extracting a fixed-width subvector from the scalable vector.
20464
20465 SDLoc DL(N);
20466 SDValue ID =
20467 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
20468
20469 EVT WhileVT = EVT::getVectorVT(
20470 *DAG.getContext(), MVT::i1,
20471 ElementCount::getScalable(VT.getVectorNumElements()));
20472
20473 // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32.
20474 EVT PromVT = getPromotedVTForPredicate(WhileVT);
20475
20476 // Get the fixed-width equivalent of PromVT for extraction.
20477 EVT ExtVT =
20480
20481 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID,
20482 N->getOperand(1), N->getOperand(2));
20483 Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res);
20484 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res,
20485 DAG.getConstant(0, DL, MVT::i64));
20486 Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
20487 }
20488 return Res;
20489 }
20490 case Intrinsic::aarch64_neon_vcvtfxs2fp:
20491 case Intrinsic::aarch64_neon_vcvtfxu2fp:
20492 return tryCombineFixedPointConvert(N, DCI, DAG);
20493 case Intrinsic::aarch64_neon_saddv:
20495 case Intrinsic::aarch64_neon_uaddv:
20497 case Intrinsic::aarch64_neon_sminv:
20499 case Intrinsic::aarch64_neon_uminv:
20501 case Intrinsic::aarch64_neon_smaxv:
20503 case Intrinsic::aarch64_neon_umaxv:
20505 case Intrinsic::aarch64_neon_fmax:
20506 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
20507 N->getOperand(1), N->getOperand(2));
20508 case Intrinsic::aarch64_neon_fmin:
20509 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
20510 N->getOperand(1), N->getOperand(2));
20511 case Intrinsic::aarch64_neon_fmaxnm:
20512 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
20513 N->getOperand(1), N->getOperand(2));
20514 case Intrinsic::aarch64_neon_fminnm:
20515 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
20516 N->getOperand(1), N->getOperand(2));
20517 case Intrinsic::aarch64_neon_smull:
20518 return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
20519 N->getOperand(1), N->getOperand(2));
20520 case Intrinsic::aarch64_neon_umull:
20521 return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
20522 N->getOperand(1), N->getOperand(2));
20523 case Intrinsic::aarch64_neon_pmull:
20524 return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
20525 N->getOperand(1), N->getOperand(2));
20526 case Intrinsic::aarch64_neon_sqdmull:
20527 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
20528 case Intrinsic::aarch64_neon_sqshl:
20529 case Intrinsic::aarch64_neon_uqshl:
20530 case Intrinsic::aarch64_neon_sqshlu:
20531 case Intrinsic::aarch64_neon_srshl:
20532 case Intrinsic::aarch64_neon_urshl:
20533 case Intrinsic::aarch64_neon_sshl:
20534 case Intrinsic::aarch64_neon_ushl:
20535 return tryCombineShiftImm(IID, N, DAG);
20536 case Intrinsic::aarch64_neon_sabd:
20537 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
20538 N->getOperand(1), N->getOperand(2));
20539 case Intrinsic::aarch64_neon_uabd:
20540 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
20541 N->getOperand(1), N->getOperand(2));
20542 case Intrinsic::aarch64_crc32b:
20543 case Intrinsic::aarch64_crc32cb:
20544 return tryCombineCRC32(0xff, N, DAG);
20545 case Intrinsic::aarch64_crc32h:
20546 case Intrinsic::aarch64_crc32ch:
20547 return tryCombineCRC32(0xffff, N, DAG);
20548 case Intrinsic::aarch64_sve_saddv:
20549 // There is no i64 version of SADDV because the sign is irrelevant.
20550 if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
20552 else
20554 case Intrinsic::aarch64_sve_uaddv:
20556 case Intrinsic::aarch64_sve_smaxv:
20558 case Intrinsic::aarch64_sve_umaxv:
20560 case Intrinsic::aarch64_sve_sminv:
20562 case Intrinsic::aarch64_sve_uminv:
20564 case Intrinsic::aarch64_sve_orv:
20566 case Intrinsic::aarch64_sve_eorv:
20568 case Intrinsic::aarch64_sve_andv:
20570 case Intrinsic::aarch64_sve_index:
20571 return LowerSVEIntrinsicIndex(N, DAG);
20572 case Intrinsic::aarch64_sve_dup:
20573 return LowerSVEIntrinsicDUP(N, DAG);
20574 case Intrinsic::aarch64_sve_dup_x:
20575 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
20576 N->getOperand(1));
20577 case Intrinsic::aarch64_sve_ext:
20578 return LowerSVEIntrinsicEXT(N, DAG);
20579 case Intrinsic::aarch64_sve_mul_u:
20580 return DAG.getNode(AArch64ISD::MUL_PRED, SDLoc(N), N->getValueType(0),
20581 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20582 case Intrinsic::aarch64_sve_smulh_u:
20583 return DAG.getNode(AArch64ISD::MULHS_PRED, SDLoc(N), N->getValueType(0),
20584 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20585 case Intrinsic::aarch64_sve_umulh_u:
20586 return DAG.getNode(AArch64ISD::MULHU_PRED, SDLoc(N), N->getValueType(0),
20587 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20588 case Intrinsic::aarch64_sve_smin_u:
20589 return DAG.getNode(AArch64ISD::SMIN_PRED, SDLoc(N), N->getValueType(0),
20590 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20591 case Intrinsic::aarch64_sve_umin_u:
20592 return DAG.getNode(AArch64ISD::UMIN_PRED, SDLoc(N), N->getValueType(0),
20593 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20594 case Intrinsic::aarch64_sve_smax_u:
20595 return DAG.getNode(AArch64ISD::SMAX_PRED, SDLoc(N), N->getValueType(0),
20596 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20597 case Intrinsic::aarch64_sve_umax_u:
20598 return DAG.getNode(AArch64ISD::UMAX_PRED, SDLoc(N), N->getValueType(0),
20599 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20600 case Intrinsic::aarch64_sve_lsl_u:
20601 return DAG.getNode(AArch64ISD::SHL_PRED, SDLoc(N), N->getValueType(0),
20602 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20603 case Intrinsic::aarch64_sve_lsr_u:
20604 return DAG.getNode(AArch64ISD::SRL_PRED, SDLoc(N), N->getValueType(0),
20605 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20606 case Intrinsic::aarch64_sve_asr_u:
20607 return DAG.getNode(AArch64ISD::SRA_PRED, SDLoc(N), N->getValueType(0),
20608 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20609 case Intrinsic::aarch64_sve_fadd_u:
20610 return DAG.getNode(AArch64ISD::FADD_PRED, SDLoc(N), N->getValueType(0),
20611 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20612 case Intrinsic::aarch64_sve_fdiv_u:
20613 return DAG.getNode(AArch64ISD::FDIV_PRED, SDLoc(N), N->getValueType(0),
20614 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20615 case Intrinsic::aarch64_sve_fmax_u:
20616 return DAG.getNode(AArch64ISD::FMAX_PRED, SDLoc(N), N->getValueType(0),
20617 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20618 case Intrinsic::aarch64_sve_fmaxnm_u:
20619 return DAG.getNode(AArch64ISD::FMAXNM_PRED, SDLoc(N), N->getValueType(0),
20620 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20621 case Intrinsic::aarch64_sve_fmla_u:
20622 return DAG.getNode(AArch64ISD::FMA_PRED, SDLoc(N), N->getValueType(0),
20623 N->getOperand(1), N->getOperand(3), N->getOperand(4),
20624 N->getOperand(2));
20625 case Intrinsic::aarch64_sve_fmin_u:
20626 return DAG.getNode(AArch64ISD::FMIN_PRED, SDLoc(N), N->getValueType(0),
20627 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20628 case Intrinsic::aarch64_sve_fminnm_u:
20629 return DAG.getNode(AArch64ISD::FMINNM_PRED, SDLoc(N), N->getValueType(0),
20630 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20631 case Intrinsic::aarch64_sve_fmul_u:
20632 return DAG.getNode(AArch64ISD::FMUL_PRED, SDLoc(N), N->getValueType(0),
20633 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20634 case Intrinsic::aarch64_sve_fsub_u:
20635 return DAG.getNode(AArch64ISD::FSUB_PRED, SDLoc(N), N->getValueType(0),
20636 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20637 case Intrinsic::aarch64_sve_add_u:
20638 return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2),
20639 N->getOperand(3));
20640 case Intrinsic::aarch64_sve_sub_u:
20641 return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2),
20642 N->getOperand(3));
20643 case Intrinsic::aarch64_sve_subr:
20644 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
20645 case Intrinsic::aarch64_sve_and_u:
20646 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), N->getOperand(2),
20647 N->getOperand(3));
20648 case Intrinsic::aarch64_sve_bic_u:
20649 return DAG.getNode(AArch64ISD::BIC, SDLoc(N), N->getValueType(0),
20650 N->getOperand(2), N->getOperand(3));
20651 case Intrinsic::aarch64_sve_eor_u:
20652 return DAG.getNode(ISD::XOR, SDLoc(N), N->getValueType(0), N->getOperand(2),
20653 N->getOperand(3));
20654 case Intrinsic::aarch64_sve_orr_u:
20655 return DAG.getNode(ISD::OR, SDLoc(N), N->getValueType(0), N->getOperand(2),
20656 N->getOperand(3));
20657 case Intrinsic::aarch64_sve_sabd_u:
20658 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
20659 N->getOperand(2), N->getOperand(3));
20660 case Intrinsic::aarch64_sve_uabd_u:
20661 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
20662 N->getOperand(2), N->getOperand(3));
20663 case Intrinsic::aarch64_sve_sdiv_u:
20664 return DAG.getNode(AArch64ISD::SDIV_PRED, SDLoc(N), N->getValueType(0),
20665 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20666 case Intrinsic::aarch64_sve_udiv_u:
20667 return DAG.getNode(AArch64ISD::UDIV_PRED, SDLoc(N), N->getValueType(0),
20668 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20669 case Intrinsic::aarch64_sve_sqadd:
20670 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
20671 case Intrinsic::aarch64_sve_sqsub_u:
20672 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
20673 N->getOperand(2), N->getOperand(3));
20674 case Intrinsic::aarch64_sve_uqadd:
20675 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
20676 case Intrinsic::aarch64_sve_uqsub_u:
20677 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
20678 N->getOperand(2), N->getOperand(3));
20679 case Intrinsic::aarch64_sve_sqadd_x:
20680 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
20681 N->getOperand(1), N->getOperand(2));
20682 case Intrinsic::aarch64_sve_sqsub_x:
20683 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
20684 N->getOperand(1), N->getOperand(2));
20685 case Intrinsic::aarch64_sve_uqadd_x:
20686 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
20687 N->getOperand(1), N->getOperand(2));
20688 case Intrinsic::aarch64_sve_uqsub_x:
20689 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
20690 N->getOperand(1), N->getOperand(2));
20691 case Intrinsic::aarch64_sve_asrd:
20692 return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
20693 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20694 case Intrinsic::aarch64_sve_cmphs:
20695 if (!N->getOperand(2).getValueType().isFloatingPoint())
20697 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20698 N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
20699 break;
20700 case Intrinsic::aarch64_sve_cmphi:
20701 if (!N->getOperand(2).getValueType().isFloatingPoint())
20703 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20704 N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
20705 break;
20706 case Intrinsic::aarch64_sve_fcmpge:
20707 case Intrinsic::aarch64_sve_cmpge:
20709 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20710 N->getOperand(3), DAG.getCondCode(ISD::SETGE));
20711 break;
20712 case Intrinsic::aarch64_sve_fcmpgt:
20713 case Intrinsic::aarch64_sve_cmpgt:
20715 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20716 N->getOperand(3), DAG.getCondCode(ISD::SETGT));
20717 break;
20718 case Intrinsic::aarch64_sve_fcmpeq:
20719 case Intrinsic::aarch64_sve_cmpeq:
20721 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20722 N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
20723 break;
20724 case Intrinsic::aarch64_sve_fcmpne:
20725 case Intrinsic::aarch64_sve_cmpne:
20727 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20728 N->getOperand(3), DAG.getCondCode(ISD::SETNE));
20729 break;
20730 case Intrinsic::aarch64_sve_fcmpuo:
20732 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20733 N->getOperand(3), DAG.getCondCode(ISD::SETUO));
20734 break;
20735 case Intrinsic::aarch64_sve_fadda:
20737 case Intrinsic::aarch64_sve_faddv:
20739 case Intrinsic::aarch64_sve_fmaxnmv:
20741 case Intrinsic::aarch64_sve_fmaxv:
20743 case Intrinsic::aarch64_sve_fminnmv:
20745 case Intrinsic::aarch64_sve_fminv:
20747 case Intrinsic::aarch64_sve_sel:
20748 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
20749 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20750 case Intrinsic::aarch64_sve_cmpeq_wide:
20751 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
20752 case Intrinsic::aarch64_sve_cmpne_wide:
20753 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
20754 case Intrinsic::aarch64_sve_cmpge_wide:
20755 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
20756 case Intrinsic::aarch64_sve_cmpgt_wide:
20757 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
20758 case Intrinsic::aarch64_sve_cmplt_wide:
20759 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
20760 case Intrinsic::aarch64_sve_cmple_wide:
20761 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
20762 case Intrinsic::aarch64_sve_cmphs_wide:
20763 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
20764 case Intrinsic::aarch64_sve_cmphi_wide:
20765 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
20766 case Intrinsic::aarch64_sve_cmplo_wide:
20767 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
20768 case Intrinsic::aarch64_sve_cmpls_wide:
20769 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
20770 case Intrinsic::aarch64_sve_ptest_any:
20771 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
20773 case Intrinsic::aarch64_sve_ptest_first:
20774 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
20776 case Intrinsic::aarch64_sve_ptest_last:
20777 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
20779 }
20780 return SDValue();
20781}
20782
20783static bool isCheapToExtend(const SDValue &N) {
20784 unsigned OC = N->getOpcode();
20785 return OC == ISD::LOAD || OC == ISD::MLOAD ||
20787}
20788
20789static SDValue
20790performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20791 SelectionDAG &DAG) {
20792 // If we have (sext (setcc A B)) and A and B are cheap to extend,
20793 // we can move the sext into the arguments and have the same result. For
20794 // example, if A and B are both loads, we can make those extending loads and
20795 // avoid an extra instruction. This pattern appears often in VLS code
20796 // generation where the inputs to the setcc have a different size to the
20797 // instruction that wants to use the result of the setcc.
20798 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
20799 N->getOperand(0)->getOpcode() == ISD::SETCC);
20800 const SDValue SetCC = N->getOperand(0);
20801
20802 const SDValue CCOp0 = SetCC.getOperand(0);
20803 const SDValue CCOp1 = SetCC.getOperand(1);
20804 if (!CCOp0->getValueType(0).isInteger() ||
20805 !CCOp1->getValueType(0).isInteger())
20806 return SDValue();
20807
20808 ISD::CondCode Code =
20809 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();
20810
20811 ISD::NodeType ExtType =
20812 isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
20813
20814 if (isCheapToExtend(SetCC.getOperand(0)) &&
20815 isCheapToExtend(SetCC.getOperand(1))) {
20816 const SDValue Ext1 =
20817 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
20818 const SDValue Ext2 =
20819 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);
20820
20821 return DAG.getSetCC(
20822 SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
20823 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
20824 }
20825
20826 return SDValue();
20827}
20828
20831 SelectionDAG &DAG) {
20832 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
20833 // we can convert that DUP into another extract_high (of a bigger DUP), which
20834 // helps the backend to decide that an sabdl2 would be useful, saving a real
20835 // extract_high operation.
20836 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
20837 (N->getOperand(0).getOpcode() == ISD::ABDU ||
20838 N->getOperand(0).getOpcode() == ISD::ABDS)) {
20839 SDNode *ABDNode = N->getOperand(0).getNode();
20840 SDValue NewABD =
20841 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG);
20842 if (!NewABD.getNode())
20843 return SDValue();
20844
20845 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
20846 }
20847
20848 if (N->getValueType(0).isFixedLengthVector() &&
20849 N->getOpcode() == ISD::SIGN_EXTEND &&
20850 N->getOperand(0)->getOpcode() == ISD::SETCC)
20851 return performSignExtendSetCCCombine(N, DCI, DAG);
20852
20853 return SDValue();
20854}
20855
20856static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
20857 SDValue SplatVal, unsigned NumVecElts) {
20858 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
20859 Align OrigAlignment = St.getAlign();
20860 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
20861
20862 // Create scalar stores. This is at least as good as the code sequence for a
20863 // split unaligned store which is a dup.s, ext.b, and two stores.
20864 // Most of the time the three stores should be replaced by store pair
20865 // instructions (stp).
20866 SDLoc DL(&St);
20867 SDValue BasePtr = St.getBasePtr();
20868 uint64_t BaseOffset = 0;
20869
20870 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
20871 SDValue NewST1 =
20872 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
20873 OrigAlignment, St.getMemOperand()->getFlags());
20874
20875 // As this is in ISel, we will not merge this add, which may degrade results.
20876 if (BasePtr->getOpcode() == ISD::ADD &&
20877 isa<ConstantSDNode>(BasePtr->getOperand(1))) {
20878 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
20879 BasePtr = BasePtr->getOperand(0);
20880 }
20881
20882 unsigned Offset = EltOffset;
20883 while (--NumVecElts) {
20884 Align Alignment = commonAlignment(OrigAlignment, Offset);
20885 SDValue OffsetPtr =
20886 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
20887 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
20888 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
20889 PtrInfo.getWithOffset(Offset), Alignment,
20890 St.getMemOperand()->getFlags());
20891 Offset += EltOffset;
20892 }
20893 return NewST1;
20894}
20895
20896// Returns an SVE type that ContentTy can be trivially sign or zero extended
20897// into.
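// For example, roughly: nxv2i16 maps to the nxv2i64 container and nxv4i8 to
// nxv4i32, as enumerated in the switch below.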
20898static MVT getSVEContainerType(EVT ContentTy) {
20899 assert(ContentTy.isSimple() && "No SVE containers for extended types");
20900
20901 switch (ContentTy.getSimpleVT().SimpleTy) {
20902 default:
20903 llvm_unreachable("No known SVE container for this MVT type");
20904 case MVT::nxv2i8:
20905 case MVT::nxv2i16:
20906 case MVT::nxv2i32:
20907 case MVT::nxv2i64:
20908 case MVT::nxv2f32:
20909 case MVT::nxv2f64:
20910 return MVT::nxv2i64;
20911 case MVT::nxv4i8:
20912 case MVT::nxv4i16:
20913 case MVT::nxv4i32:
20914 case MVT::nxv4f32:
20915 return MVT::nxv4i32;
20916 case MVT::nxv8i8:
20917 case MVT::nxv8i16:
20918 case MVT::nxv8f16:
20919 case MVT::nxv8bf16:
20920 return MVT::nxv8i16;
20921 case MVT::nxv16i8:
20922 return MVT::nxv16i8;
20923 }
20924}
20925
20926static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
20927 SDLoc DL(N);
20928 EVT VT = N->getValueType(0);
20929
20931 return SDValue();
20932
20933 EVT ContainerVT = VT;
20934 if (ContainerVT.isInteger())
20935 ContainerVT = getSVEContainerType(ContainerVT);
20936
20937 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
20938 SDValue Ops[] = { N->getOperand(0), // Chain
20939 N->getOperand(2), // Pg
20940 N->getOperand(3), // Base
20941 DAG.getValueType(VT) };
20942
20943 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
20944 SDValue LoadChain = SDValue(Load.getNode(), 1);
20945
20946 if (ContainerVT.isInteger() && (VT != ContainerVT))
20947 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
20948
20949 return DAG.getMergeValues({ Load, LoadChain }, DL);
20950}
20951
20953 SDLoc DL(N);
20954 EVT VT = N->getValueType(0);
20955 EVT PtrTy = N->getOperand(3).getValueType();
20956
20957 EVT LoadVT = VT;
20958 if (VT.isFloatingPoint())
20959 LoadVT = VT.changeTypeToInteger();
20960
20961 auto *MINode = cast<MemIntrinsicSDNode>(N);
20962 SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
20963 SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
20964 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
20965 MINode->getOperand(2), PassThru,
20966 MINode->getMemoryVT(), MINode->getMemOperand(),
20968
20969 if (VT.isFloatingPoint()) {
20970 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
20971 return DAG.getMergeValues(Ops, DL);
20972 }
20973
20974 return L;
20975}
20976
20977template <unsigned Opcode>
20979 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
20981 "Unsupported opcode.");
20982 SDLoc DL(N);
20983 EVT VT = N->getValueType(0);
20984
20985 EVT LoadVT = VT;
20986 if (VT.isFloatingPoint())
20987 LoadVT = VT.changeTypeToInteger();
20988
20989 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
20990 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
20991 SDValue LoadChain = SDValue(Load.getNode(), 1);
20992
20993 if (VT.isFloatingPoint())
20994 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
20995
20996 return DAG.getMergeValues({Load, LoadChain}, DL);
20997}
20998
21000 SDLoc DL(N);
21001 SDValue Data = N->getOperand(2);
21002 EVT DataVT = Data.getValueType();
21003 EVT HwSrcVt = getSVEContainerType(DataVT);
21004 SDValue InputVT = DAG.getValueType(DataVT);
21005
21006 if (DataVT.isFloatingPoint())
21007 InputVT = DAG.getValueType(HwSrcVt);
21008
21009 SDValue SrcNew;
21010 if (Data.getValueType().isFloatingPoint())
21011 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
21012 else
21013 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
21014
21015 SDValue Ops[] = { N->getOperand(0), // Chain
21016 SrcNew,
21017 N->getOperand(4), // Base
21018 N->getOperand(3), // Pg
21019 InputVT
21020 };
21021
21022 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
21023}
21024
21026 SDLoc DL(N);
21027
21028 SDValue Data = N->getOperand(2);
21029 EVT DataVT = Data.getValueType();
21030 EVT PtrTy = N->getOperand(4).getValueType();
21031
21032 if (DataVT.isFloatingPoint())
21034
21035 auto *MINode = cast<MemIntrinsicSDNode>(N);
21036 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
21037 DAG.getUNDEF(PtrTy), MINode->getOperand(3),
21038 MINode->getMemoryVT(), MINode->getMemOperand(),
21039 ISD::UNINDEXED, false, false);
21040}
21041
21042/// Replace a vector store of a splat of zeros by scalar stores of WZR/XZR. The
21043/// load store optimizer pass will merge them to store pair stores. This should
21044/// be better than a movi to create the vector zero followed by a vector store
21045/// if the zero constant is not re-used, since one instruction and one register
21046/// live range will be removed.
21047///
21048/// For example, the final generated code should be:
21049///
21050/// stp xzr, xzr, [x0]
21051///
21052/// instead of:
21053///
21054/// movi v0.2d, #0
21055/// str q0, [x0]
21056///
21057static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
21058 SDValue StVal = St.getValue();
21059 EVT VT = StVal.getValueType();
21060
21061 // Avoid scalarizing zero splat stores for scalable vectors.
21062 if (VT.isScalableVector())
21063 return SDValue();
21064
21065 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
21066 // 2, 3 or 4 i32 elements.
21067 int NumVecElts = VT.getVectorNumElements();
21068 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
21069 VT.getVectorElementType().getSizeInBits() == 64) ||
21070 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
21071 VT.getVectorElementType().getSizeInBits() == 32)))
21072 return SDValue();
21073
21074 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
21075 return SDValue();
21076
21077 // If the zero constant has more than one use then the vector store could be
21078 // better since the constant mov will be amortized and stp q instructions
21079 // should be able to be formed.
21080 if (!StVal.hasOneUse())
21081 return SDValue();
21082
21083 // If the store is truncating then it's going down to i16 or smaller, which
21084 // means it can be implemented in a single store anyway.
21085 if (St.isTruncatingStore())
21086 return SDValue();
21087
21088 // If the immediate offset of the address operand is too large for the stp
21089 // instruction, then bail out.
21090 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
21091 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
21092 if (Offset < -512 || Offset > 504)
21093 return SDValue();
21094 }
21095
21096 for (int I = 0; I < NumVecElts; ++I) {
21097 SDValue EltVal = StVal.getOperand(I);
21098 if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
21099 return SDValue();
21100 }
21101
21102 // Use a CopyFromReg WZR/XZR here to prevent
21103 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
21104 SDLoc DL(&St);
21105 unsigned ZeroReg;
21106 EVT ZeroVT;
21107 if (VT.getVectorElementType().getSizeInBits() == 32) {
21108 ZeroReg = AArch64::WZR;
21109 ZeroVT = MVT::i32;
21110 } else {
21111 ZeroReg = AArch64::XZR;
21112 ZeroVT = MVT::i64;
21113 }
21114 SDValue SplatVal =
21115 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
21116 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
21117}
21118
21119/// Replace a vector store of a splat of a scalar by scalar stores of the scalar
21120/// value. The load store optimizer pass will merge them to store pair stores.
21121/// This has better performance than a splat of the scalar followed by a split
21122/// vector store. Even if the stores are not merged it is four stores vs a dup,
21123/// followed by an ext.b and two stores.
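/// For example, roughly: a store of (v4i32 splat w) can become four scalar
/// stores of w, which the load/store optimizer can then merge into two stp
/// instructions.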
21124static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
21125 SDValue StVal = St.getValue();
21126 EVT VT = StVal.getValueType();
21127
21128 // Don't replace floating point stores, they possibly won't be transformed to
21129 // stp because of the store pair suppress pass.
21130 if (VT.isFloatingPoint())
21131 return SDValue();
21132
21133 // We can express a splat as store pair(s) for 2 or 4 elements.
21134 unsigned NumVecElts = VT.getVectorNumElements();
21135 if (NumVecElts != 4 && NumVecElts != 2)
21136 return SDValue();
21137
21138 // If the store is truncating then it's going down to i16 or smaller, which
21139 // means it can be implemented in a single store anyway.
21140 if (St.isTruncatingStore())
21141 return SDValue();
21142
21143 // Check that this is a splat.
21144 // Make sure that each of the relevant vector element locations are inserted
21145 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
21146 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
21147 SDValue SplatVal;
21148 for (unsigned I = 0; I < NumVecElts; ++I) {
21149 // Check for insert vector elements.
21150 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
21151 return SDValue();
21152
21153 // Check that same value is inserted at each vector element.
21154 if (I == 0)
21155 SplatVal = StVal.getOperand(1);
21156 else if (StVal.getOperand(1) != SplatVal)
21157 return SDValue();
21158
21159 // Check insert element index.
21160 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
21161 if (!CIndex)
21162 return SDValue();
21163 uint64_t IndexVal = CIndex->getZExtValue();
21164 if (IndexVal >= NumVecElts)
21165 return SDValue();
21166 IndexNotInserted.reset(IndexVal);
21167
21168 StVal = StVal.getOperand(0);
21169 }
21170 // Check that all vector element locations were inserted to.
21171 if (IndexNotInserted.any())
21172 return SDValue();
21173
21174 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
21175}
21176
21178 SelectionDAG &DAG,
21179 const AArch64Subtarget *Subtarget) {
21180
21181 StoreSDNode *S = cast<StoreSDNode>(N);
21182 if (S->isVolatile() || S->isIndexed())
21183 return SDValue();
21184
21185 SDValue StVal = S->getValue();
21186 EVT VT = StVal.getValueType();
21187
21188 if (!VT.isFixedLengthVector())
21189 return SDValue();
21190
21191 // If we get a splat of zeros, convert this vector store to a store of
21192 // scalars. They will be merged into store pairs of xzr thereby removing one
21193 // instruction and one register.
21194 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
21195 return ReplacedZeroSplat;
21196
21197 // FIXME: The logic for deciding if an unaligned store should be split should
21198 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
21199 // a call to that function here.
21200
21201 if (!Subtarget->isMisaligned128StoreSlow())
21202 return SDValue();
21203
21204 // Don't split at -Oz.
21206 return SDValue();
21207
21208 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
21209 // those up regresses performance on micro-benchmarks and olden/bh.
21210 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
21211 return SDValue();
21212
21213 // Split unaligned 16B stores. They are terrible for performance.
21214 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
21215 // extensions can use this to mark that it does not want splitting to happen
21216 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
21217 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
21218 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
21219 S->getAlign() <= Align(2))
21220 return SDValue();
21221
21222 // If we get a splat of a scalar convert this vector store to a store of
21223 // scalars. They will be merged into store pairs thereby removing two
21224 // instructions.
21225 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
21226 return ReplacedSplat;
21227
21228 SDLoc DL(S);
21229
21230 // Split VT into two.
21231 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
21232 unsigned NumElts = HalfVT.getVectorNumElements();
21233 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
21234 DAG.getConstant(0, DL, MVT::i64));
21235 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
21236 DAG.getConstant(NumElts, DL, MVT::i64));
21237 SDValue BasePtr = S->getBasePtr();
21238 SDValue NewST1 =
21239 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
21240 S->getAlign(), S->getMemOperand()->getFlags());
21241 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
21242 DAG.getConstant(8, DL, MVT::i64));
21243 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
21244 S->getPointerInfo(), S->getAlign(),
21245 S->getMemOperand()->getFlags());
21246}
21247
21249 assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexepected Opcode!");
21250
21251 // splice(pg, op1, undef) -> op1
21252 if (N->getOperand(2).isUndef())
21253 return N->getOperand(1);
21254
21255 return SDValue();
21256}
21257
21259 const AArch64Subtarget *Subtarget) {
21260 assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
21261 N->getOpcode() == AArch64ISD::UUNPKLO) &&
21262 "Unexpected Opcode!");
21263
21264 // uunpklo/hi undef -> undef
21265 if (N->getOperand(0).isUndef())
21266 return DAG.getUNDEF(N->getValueType(0));
21267
21268 // If this is a masked load followed by an UUNPKLO, fold this into a masked
21269 // extending load. We can do this even if this is already a masked
21270 // {z,}extload.
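  // For example, roughly: uunpklo(masked_load nxv16i8) with a ptrue mask whose
  // pattern still fits after doubling the element size can become a single
  // zero-extending masked load that produces nxv8i16 directly.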
21271 if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
21272 N->getOpcode() == AArch64ISD::UUNPKLO) {
21273 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
21274 SDValue Mask = MLD->getMask();
21275 SDLoc DL(N);
21276
21277 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
21278 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
21279 (MLD->getPassThru()->isUndef() ||
21280 isZerosVector(MLD->getPassThru().getNode()))) {
21281 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
21282 unsigned PgPattern = Mask->getConstantOperandVal(0);
21283 EVT VT = N->getValueType(0);
21284
21285 // Ensure we can double the size of the predicate pattern
21286 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
21287 if (NumElts &&
21288 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
21289 Mask =
21290 getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
21291 SDValue PassThru = DAG.getConstant(0, DL, VT);
21292 SDValue NewLoad = DAG.getMaskedLoad(
21293 VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
21294 PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
21295 MLD->getAddressingMode(), ISD::ZEXTLOAD);
21296
21297 DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));
21298
21299 return NewLoad;
21300 }
21301 }
21302 }
21303
21304 return SDValue();
21305}
21306
21307 static bool isHalvingTruncateAndConcatOfLegalScalableType(SDNode *N) {
21308 if (N->getOpcode() != AArch64ISD::UZP1)
21309 return false;
21310 SDValue Op0 = N->getOperand(0);
21311 EVT SrcVT = Op0->getValueType(0);
21312 EVT DstVT = N->getValueType(0);
21313 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv16i8) ||
21314 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv8i16) ||
21315 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv4i32);
21316}
21317
21318// Try to combine rounding shifts where the operands come from an extend, and
21319// the result is truncated and combined into one vector.
21320// uzp1(rshrnb(uunpklo(X),C), rshrnb(uunpkhi(X), C)) -> urshr(X, C)
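// Editorial illustration (not from the upstream source): with X : nxv16i8 and
// a shift amount C, the pattern above corresponds roughly to
//   lo = rshrnb(uunpklo(X), C)   ; nxv8i16 -> nxv16i8, low half of each lane valid
//   hi = rshrnb(uunpkhi(X), C)
//   uzp1(lo, hi)                 ; nxv16i8
// which behaves like a single predicated rounding shift urshr(X, C) on the
// original element type.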
21321 static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG) {
21322 assert(N->getOpcode() == AArch64ISD::UZP1 && "Only UZP1 expected.");
21323 SDValue Op0 = N->getOperand(0);
21324 SDValue Op1 = N->getOperand(1);
21325 EVT ResVT = N->getValueType(0);
21326
21327 unsigned RshOpc = Op0.getOpcode();
21328 if (RshOpc != AArch64ISD::RSHRNB_I)
21329 return SDValue();
21330
21331 // Same op code and imm value?
21332 SDValue ShiftValue = Op0.getOperand(1);
21333 if (RshOpc != Op1.getOpcode() || ShiftValue != Op1.getOperand(1))
21334 return SDValue();
21335
21336 // Same unextended operand value?
21337 SDValue Lo = Op0.getOperand(0);
21338 SDValue Hi = Op1.getOperand(0);
21339 if (Lo.getOpcode() != AArch64ISD::UUNPKLO &&
21340 Hi.getOpcode() != AArch64ISD::UUNPKHI)
21341 return SDValue();
21342 SDValue OrigArg = Lo.getOperand(0);
21343 if (OrigArg != Hi.getOperand(0))
21344 return SDValue();
21345
21346 SDLoc DL(N);
21347 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, ResVT,
21348 getPredicateForVector(DAG, DL, ResVT), OrigArg,
21349 ShiftValue);
21350}
21351
21352// Try to simplify:
21353// t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
21354// t2 = nxv8i16 srl(t1, ShiftValue)
21355// to
21356// t1 = nxv8i16 rshrnb(X, shiftvalue).
21357// rshrnb will zero the top half bits of each element. Therefore, this combine
21358// should only be performed when a following instruction with the rshrnb
21359// as an operand does not care about the top half of each element. For example,
21360// a uzp1 or a truncating store.
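// Editorial illustration (not from the upstream source): for X : nxv8i16 and
// ShiftValue == 4 this is
//   t1 = add X, splat(8)         ; 8 == 1 << (4 - 1), the rounding bias
//   t2 = srl t1, splat(4)
// which matches the SVE2 RSHRNB narrowing instruction, as long as the consumer
// (e.g. the uzp1 or truncating store mentioned above) ignores the top half of
// each 16-bit element.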
21361 static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG,
21362 const AArch64Subtarget *Subtarget) {
21363 EVT VT = Srl->getValueType(0);
21364 if (!VT.isScalableVector() || !Subtarget->hasSVE2())
21365 return SDValue();
21366
21367 EVT ResVT;
21368 if (VT == MVT::nxv8i16)
21369 ResVT = MVT::nxv16i8;
21370 else if (VT == MVT::nxv4i32)
21371 ResVT = MVT::nxv8i16;
21372 else if (VT == MVT::nxv2i64)
21373 ResVT = MVT::nxv4i32;
21374 else
21375 return SDValue();
21376
21377 SDLoc DL(Srl);
21378 unsigned ShiftValue;
21379 SDValue RShOperand;
21380 if (!canLowerSRLToRoundingShiftForVT(Srl, ResVT, DAG, ShiftValue, RShOperand))
21381 return SDValue();
21382 SDValue Rshrnb = DAG.getNode(
21383 AArch64ISD::RSHRNB_I, DL, ResVT,
21384 {RShOperand, DAG.getTargetConstant(ShiftValue, DL, MVT::i32)});
21385 return DAG.getNode(ISD::BITCAST, DL, VT, Rshrnb);
21386}
21387
21388 static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
21389 const AArch64Subtarget *Subtarget) {
21390 SDLoc DL(N);
21391 SDValue Op0 = N->getOperand(0);
21392 SDValue Op1 = N->getOperand(1);
21393 EVT ResVT = N->getValueType(0);
21394
21395 // uzp1(x, undef) -> concat(truncate(x), undef)
21396 if (Op1.getOpcode() == ISD::UNDEF) {
21397 EVT BCVT = MVT::Other, HalfVT = MVT::Other;
21398 switch (ResVT.getSimpleVT().SimpleTy) {
21399 default:
21400 break;
21401 case MVT::v16i8:
21402 BCVT = MVT::v8i16;
21403 HalfVT = MVT::v8i8;
21404 break;
21405 case MVT::v8i16:
21406 BCVT = MVT::v4i32;
21407 HalfVT = MVT::v4i16;
21408 break;
21409 case MVT::v4i32:
21410 BCVT = MVT::v2i64;
21411 HalfVT = MVT::v2i32;
21412 break;
21413 }
21414 if (BCVT != MVT::Other) {
21415 SDValue BC = DAG.getBitcast(BCVT, Op0);
21416 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
21417 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
21418 DAG.getUNDEF(HalfVT));
21419 }
21420 }
21421
21422 if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
21423 return Urshr;
21424
21425 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op0, DAG, Subtarget))
21426 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1);
21427
21428 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op1, DAG, Subtarget))
21429 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb);
21430
21431 // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
21432 if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
21433 if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
21434 SDValue X = Op0.getOperand(0).getOperand(0);
21435 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
21436 }
21437 }
21438
21439 // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z)
21440 if (Op1.getOpcode() == AArch64ISD::UUNPKHI) {
21441 if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
21442 SDValue Z = Op1.getOperand(0).getOperand(1);
21443 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
21444 }
21445 }
21446
21447 // These optimizations only work on little endian.
21448 if (!DAG.getDataLayout().isLittleEndian())
21449 return SDValue();
21450
21451 // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
21452 // Example:
21453 // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
21454 // to
21455 // nxv4i32 = uzp1 nxv4i32 x, nxv4i32 y
21456 if (isHalvingTruncateAndConcatOfLegalScalableType(N) &&
21457 Op0.getOpcode() == ISD::BITCAST && Op1.getOpcode() == ISD::BITCAST) {
21458 if (Op0.getOperand(0).getValueType() == Op1.getOperand(0).getValueType()) {
21459 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0.getOperand(0),
21460 Op1.getOperand(0));
21461 }
21462 }
21463
21464 if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
21465 return SDValue();
21466
21467 SDValue SourceOp0 = peekThroughBitcasts(Op0);
21468 SDValue SourceOp1 = peekThroughBitcasts(Op1);
21469
21470 // truncating uzp1(x, y) -> xtn(concat (x, y))
21471 if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
21472 EVT Op0Ty = SourceOp0.getValueType();
21473 if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
21474 (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
21475 SDValue Concat =
21476 DAG.getNode(ISD::CONCAT_VECTORS, DL,
21477 Op0Ty.getDoubleNumVectorElementsVT(*DAG.getContext()),
21478 SourceOp0, SourceOp1);
21479 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Concat);
21480 }
21481 }
21482
21483 // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
21484 if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
21485 SourceOp1.getOpcode() != ISD::TRUNCATE)
21486 return SDValue();
21487 SourceOp0 = SourceOp0.getOperand(0);
21488 SourceOp1 = SourceOp1.getOperand(0);
21489
21490 if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
21491 !SourceOp0.getValueType().isSimple())
21492 return SDValue();
21493
21494 EVT ResultTy;
21495
21496 switch (SourceOp0.getSimpleValueType().SimpleTy) {
21497 case MVT::v2i64:
21498 ResultTy = MVT::v4i32;
21499 break;
21500 case MVT::v4i32:
21501 ResultTy = MVT::v8i16;
21502 break;
21503 case MVT::v8i16:
21504 ResultTy = MVT::v16i8;
21505 break;
21506 default:
21507 return SDValue();
21508 }
21509
21510 SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
21511 SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
21512 SDValue UzpResult =
21513 DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
21514
21515 EVT BitcastResultTy;
21516
21517 switch (ResVT.getSimpleVT().SimpleTy) {
21518 case MVT::v2i32:
21519 BitcastResultTy = MVT::v2i64;
21520 break;
21521 case MVT::v4i16:
21522 BitcastResultTy = MVT::v4i32;
21523 break;
21524 case MVT::v8i8:
21525 BitcastResultTy = MVT::v8i16;
21526 break;
21527 default:
21528 llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
21529 }
21530
21531 return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
21532 DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
21533}
21534
21535 static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
21536 unsigned Opc = N->getOpcode();
21537
21538 assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
21539 Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) ||
21540 (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
21541 Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) &&
21542 "Invalid opcode.");
21543
21544 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
21545 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
21546 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
21547 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
21548 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
21549 Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
21550 Opc == AArch64ISD::GLD1S_SXTW_MERGE_ZERO ||
21551 Opc == AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
21552
21553 SDLoc DL(N);
21554 SDValue Chain = N->getOperand(0);
21555 SDValue Pg = N->getOperand(1);
21556 SDValue Base = N->getOperand(2);
21557 SDValue Offset = N->getOperand(3);
21558 SDValue Ty = N->getOperand(4);
21559
21560 EVT ResVT = N->getValueType(0);
21561
21562 const auto OffsetOpc = Offset.getOpcode();
21563 const bool OffsetIsZExt =
21564 OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
21565 const bool OffsetIsSExt =
21566 OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
21567
21568 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
21569 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
21570 SDValue ExtPg = Offset.getOperand(0);
21571 VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
21572 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
21573
21574 // If the predicate for the sign- or zero-extended offset is the
21575 // same as the predicate used for this load and the sign-/zero-extension
21576 // was from 32 bits...
21577 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
21578 SDValue UnextendedOffset = Offset.getOperand(1);
21579
21580 unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
21581 if (Signed)
21582 NewOpc = getSignExtendedGatherOpcode(NewOpc);
21583
21584 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
21585 {Chain, Pg, Base, UnextendedOffset, Ty});
21586 }
21587 }
21588
21589 return SDValue();
21590}
21591
21592/// Optimize a vector shift instruction and its operand if shifted out
21593/// bits are not used.
21594 static SDValue performVectorShiftCombine(SDNode *N,
21595 const AArch64TargetLowering &TLI,
21596 TargetLowering::DAGCombinerInfo &DCI) {
21597 assert(N->getOpcode() == AArch64ISD::VASHR ||
21598 N->getOpcode() == AArch64ISD::VLSHR);
21599
21600 SDValue Op = N->getOperand(0);
21601 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
21602
21603 unsigned ShiftImm = N->getConstantOperandVal(1);
21604 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
21605
21606 // Remove sign_extend_inreg (ashr(shl(x)) based on the number of sign bits.
21607 if (N->getOpcode() == AArch64ISD::VASHR &&
21608 Op.getOpcode() == AArch64ISD::VSHL &&
21609 N->getOperand(1) == Op.getOperand(1))
21610 if (DCI.DAG.ComputeNumSignBits(Op.getOperand(0)) > ShiftImm)
21611 return Op.getOperand(0);
21612
21613 APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
21614 APInt DemandedMask = ~ShiftedOutBits;
21615
21616 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
21617 return SDValue(N, 0);
21618
21619 return SDValue();
21620}
21621
21622 static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) {
21623 // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
21624 // This transform works in partnership with performSetCCPunpkCombine to
21625 // remove unnecessary transfer of predicates into standard registers and back
21626 if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
21627 N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
21628 MVT::i1) {
21629 SDValue CC = N->getOperand(0)->getOperand(0);
21630 auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
21631 SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
21632 DAG.getVectorIdxConstant(0, SDLoc(N)));
21633 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
21634 }
21635
21636 return SDValue();
21637}
21638
21639/// Target-specific DAG combine function for post-increment LD1 (lane) and
21640/// post-increment LD1R.
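/// Editorial illustration (not from the upstream source): the typical input is
///   t = load i32, [x0]
///   v = insertelement v4i32 V, t, lane     (or a dup of t for LD1R)
///   x0 = add x0, 4
/// which is combined below into a single post-incremented LD1 (lane) or LD1R,
/// e.g. "ld1 { v0.s }[lane], [x0], #4", provided the increment matches the
/// element size and no cycle would be created.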
21641 static SDValue performPostLD1Combine(SDNode *N,
21642 TargetLowering::DAGCombinerInfo &DCI,
21643 bool IsLaneOp) {
21644 if (DCI.isBeforeLegalizeOps())
21645 return SDValue();
21646
21647 SelectionDAG &DAG = DCI.DAG;
21648 EVT VT = N->getValueType(0);
21649
21650 if (!VT.is128BitVector() && !VT.is64BitVector())
21651 return SDValue();
21652
21653 unsigned LoadIdx = IsLaneOp ? 1 : 0;
21654 SDNode *LD = N->getOperand(LoadIdx).getNode();
21656 // If it is not a LOAD, we cannot do this combine.
21656 if (LD->getOpcode() != ISD::LOAD)
21657 return SDValue();
21658
21659 // The vector lane must be a constant in the LD1LANE opcode.
21660 SDValue Lane;
21661 if (IsLaneOp) {
21662 Lane = N->getOperand(2);
21663 auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
21664 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
21665 return SDValue();
21666 }
21667
21668 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
21669 EVT MemVT = LoadSDN->getMemoryVT();
21670 // Check if memory operand is the same type as the vector element.
21671 if (MemVT != VT.getVectorElementType())
21672 return SDValue();
21673
21674 // Check if there are other uses. If so, do not combine as it will introduce
21675 // an extra load.
21676 for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
21677 ++UI) {
21678 if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
21679 continue;
21680 if (*UI != N)
21681 return SDValue();
21682 }
21683
21684 // If there is one use and it can splat the value, prefer that operation.
21685 // TODO: This could be expanded to more operations if they reliably use the
21686 // index variants.
21687 if (N->hasOneUse()) {
21688 unsigned UseOpc = N->use_begin()->getOpcode();
21689 if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
21690 return SDValue();
21691 }
21692
21693 SDValue Addr = LD->getOperand(1);
21694 SDValue Vector = N->getOperand(0);
21695 // Search for a use of the address operand that is an increment.
21696 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
21697 Addr.getNode()->use_end(); UI != UE; ++UI) {
21698 SDNode *User = *UI;
21699 if (User->getOpcode() != ISD::ADD
21700 || UI.getUse().getResNo() != Addr.getResNo())
21701 continue;
21702
21703 // If the increment is a constant, it must match the memory ref size.
21704 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
21705 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
21706 uint32_t IncVal = CInc->getZExtValue();
21707 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
21708 if (IncVal != NumBytes)
21709 continue;
21710 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
21711 }
21712
21713 // To avoid creating a cycle, make sure that neither the load nor the add
21714 // is a predecessor of the other or of the Vector.
21715 SmallPtrSet<const SDNode *, 32> Visited;
21716 SmallVector<const SDNode *, 16> Worklist;
21717 Visited.insert(Addr.getNode());
21718 Worklist.push_back(User);
21719 Worklist.push_back(LD);
21720 Worklist.push_back(Vector.getNode());
21721 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
21722 SDNode::hasPredecessorHelper(User, Visited, Worklist))
21723 continue;
21724
21725 SmallVector<SDValue, 8> Ops;
21726 Ops.push_back(LD->getOperand(0)); // Chain
21727 if (IsLaneOp) {
21728 Ops.push_back(Vector); // The vector to be inserted
21729 Ops.push_back(Lane); // The lane to be inserted in the vector
21730 }
21731 Ops.push_back(Addr);
21732 Ops.push_back(Inc);
21733
21734 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
21735 SDVTList SDTys = DAG.getVTList(Tys);
21736 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
21737 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
21738 MemVT,
21739 LoadSDN->getMemOperand());
21740
21741 // Update the uses.
21742 SDValue NewResults[] = {
21743 SDValue(LD, 0), // The result of load
21744 SDValue(UpdN.getNode(), 2) // Chain
21745 };
21746 DCI.CombineTo(LD, NewResults);
21747 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
21748 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
21749
21750 break;
21751 }
21752 return SDValue();
21753}
21754
21755/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
21756/// address translation.
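/// Editorial illustration (not from the upstream source): with TBI only bits
/// [55:0] of the address take part in translation, so SimplifyDemandedBits is
/// asked for just the low 56 bits below; an operation that only changes the
/// tag byte (bits [63:56]) of Addr can therefore be dropped.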
21757 static bool performTBISimplification(SDValue Addr,
21758 TargetLowering::DAGCombinerInfo &DCI,
21759 SelectionDAG &DAG) {
21760 APInt DemandedMask = APInt::getLowBitsSet(64, 56);
21761 KnownBits Known;
21762 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
21763 !DCI.isBeforeLegalizeOps());
21764 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21765 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
21766 DCI.CommitTargetLoweringOpt(TLO);
21767 return true;
21768 }
21769 return false;
21770}
21771
21772 static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
21773 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
21774 "Expected STORE dag node in input!");
21775
21776 if (auto Store = dyn_cast<StoreSDNode>(N)) {
21777 if (!Store->isTruncatingStore() || Store->isIndexed())
21778 return SDValue();
21779 SDValue Ext = Store->getValue();
21780 auto ExtOpCode = Ext.getOpcode();
21781 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
21782 ExtOpCode != ISD::ANY_EXTEND)
21783 return SDValue();
21784 SDValue Orig = Ext->getOperand(0);
21785 if (Store->getMemoryVT() != Orig.getValueType())
21786 return SDValue();
21787 return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
21788 Store->getBasePtr(), Store->getMemOperand());
21789 }
21790
21791 return SDValue();
21792}
21793
21794// A custom combine to lower load <3 x i8> as the more efficient sequence
21795// below:
21796// ldrb wX, [x0, #2]
21797// ldrh wY, [x0]
21798// orr wX, wY, wX, lsl #16
21799// fmov s0, wX
21800//
21801// Note that an alternative sequence with even fewer (although usually more
21802// complex/expensive) instructions would be:
21803// ld1r.4h { v0 }, [x0], #2
21804// ld1.b { v0 }[2], [x0]
21805//
21806// Generating this sequence unfortunately results in noticeably worse codegen
21807// for code that extends the loaded v3i8, due to legalization breaking vector
21808// shuffle detection in a way that is very difficult to work around.
21809// TODO: Revisit once v3i8 legalization has been improved in general.
21810 static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
21811 EVT MemVT = LD->getMemoryVT();
21812 if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
21813 LD->getOriginalAlign() >= 4)
21814 return SDValue();
21815
21816 SDLoc DL(LD);
21817 MachineFunction &MF = DAG.getMachineFunction();
21818 SDValue Chain = LD->getChain();
21819 SDValue BasePtr = LD->getBasePtr();
21820 MachineMemOperand *MMO = LD->getMemOperand();
21821 assert(LD->getOffset().isUndef() && "undef offset expected");
21822
21823 // Load 2 x i8, then 1 x i8.
21824 SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
21825 TypeSize Offset2 = TypeSize::getFixed(2);
21826 SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
21827 DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
21828 MF.getMachineMemOperand(MMO, 2, 1));
21829
21830 // Extend to i32.
21831 SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
21832 SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
21833
21834 // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
21835 SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
21836 DAG.getConstant(16, DL, MVT::i32));
21837 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl);
21838 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
21839
21840 // Extract v3i8 again.
21841 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
21842 DAG.getConstant(0, DL, MVT::i64));
21843 SDValue TokenFactor = DAG.getNode(
21844 ISD::TokenFactor, DL, MVT::Other,
21845 {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
21846 return DAG.getMergeValues({Extract, TokenFactor}, DL);
21847}
21848
21849// Perform TBI simplification if supported by the target and try to break up
21850 // non-temporal loads larger than 256 bits for odd types, so that 256-bit LDNP Q-form
21851// load instructions can be selected.
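// Editorial illustration (not from the upstream source): a non-temporal load
// of, say, 384 bits of v12i32 is handled below as one 256-bit v8i32 load plus
// a 128-bit v4i32 load of the remainder; the remainder is inserted into an
// undef 256-bit vector, the pieces are concatenated, and the original v12i32
// value is extracted from the concatenation.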
21852 static SDValue performLOADCombine(SDNode *N,
21853 TargetLowering::DAGCombinerInfo &DCI,
21854 SelectionDAG &DAG,
21855 const AArch64Subtarget *Subtarget) {
21856 if (Subtarget->supportsAddressTopByteIgnored())
21857 performTBISimplification(N->getOperand(1), DCI, DAG);
21858
21859 LoadSDNode *LD = cast<LoadSDNode>(N);
21860 if (LD->isVolatile() || !Subtarget->isLittleEndian())
21861 return SDValue(N, 0);
21862
21863 if (SDValue Res = combineV3I8LoadExt(LD, DAG))
21864 return Res;
21865
21866 if (!LD->isNonTemporal())
21867 return SDValue(N, 0);
21868
21869 EVT MemVT = LD->getMemoryVT();
21870 if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
21871 MemVT.getSizeInBits() % 256 == 0 ||
21872 256 % MemVT.getScalarSizeInBits() != 0)
21873 return SDValue(N, 0);
21874
21875 SDLoc DL(LD);
21876 SDValue Chain = LD->getChain();
21877 SDValue BasePtr = LD->getBasePtr();
21878 SDNodeFlags Flags = LD->getFlags();
21879 SmallVector<SDValue, 4> LoadOps;
21880 SmallVector<SDValue, 4> LoadOpsChain;
21881 // Replace any non-temporal load over 256 bits with a series of 256-bit loads
21882 // and a scalar/vector load of less than 256 bits. This way we can utilize the
21883 // 256-bit loads and reduce the number of load instructions generated.
21884 MVT NewVT =
21885 MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
21886 256 / MemVT.getVectorElementType().getSizeInBits());
21887 unsigned Num256Loads = MemVT.getSizeInBits() / 256;
21888 // Create all 256-bit loads starting from offset 0 and up to (Num256Loads - 1) * 32.
21889 for (unsigned I = 0; I < Num256Loads; I++) {
21890 unsigned PtrOffset = I * 32;
21891 SDValue NewPtr = DAG.getMemBasePlusOffset(
21892 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
21893 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
21894 SDValue NewLoad = DAG.getLoad(
21895 NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
21896 NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
21897 LoadOps.push_back(NewLoad);
21898 LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
21899 }
21900
21901 // Process remaining bits of the load operation.
21902 // This is done by creating an UNDEF vector to match the size of the
21903 // 256-bit loads and inserting the remaining load to it. We extract the
21904 // original load type at the end using EXTRACT_SUBVECTOR instruction.
21905 unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
21906 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
21907 MVT RemainingVT = MVT::getVectorVT(
21908 MemVT.getVectorElementType().getSimpleVT(),
21909 BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
21910 SDValue NewPtr = DAG.getMemBasePlusOffset(
21911 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
21912 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
21913 SDValue RemainingLoad =
21914 DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
21915 LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
21916 LD->getMemOperand()->getFlags(), LD->getAAInfo());
21917 SDValue UndefVector = DAG.getUNDEF(NewVT);
21918 SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
21919 SDValue ExtendedReminingLoad =
21920 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
21921 {UndefVector, RemainingLoad, InsertIdx});
21922 LoadOps.push_back(ExtendedReminingLoad);
21923 LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
21924 EVT ConcatVT =
21925 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
21926 LoadOps.size() * NewVT.getVectorNumElements());
21927 SDValue ConcatVectors =
21928 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
21929 // Extract the original vector type size.
21930 SDValue ExtractSubVector =
21931 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
21932 {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
21933 SDValue TokenFactor =
21934 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
21935 return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
21936}
21937
21938 static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) {
21939 EVT VecVT = Op.getValueType();
21940 assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
21941 "Need boolean vector type.");
21942
21943 if (Depth > 3)
21944 return MVT::INVALID_SIMPLE_VALUE_TYPE;
21945
21946 // We can get the base type from a vector compare or truncate.
21947 if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
21948 return Op.getOperand(0).getValueType();
21949
21950 // If an operand is a bool vector, continue looking.
21951 EVT BaseVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
21952 for (SDValue Operand : Op->op_values()) {
21953 if (Operand.getValueType() != VecVT)
21954 continue;
21955
21956 EVT OperandVT = tryGetOriginalBoolVectorType(Operand, Depth + 1);
21957 if (!BaseVT.isSimple())
21958 BaseVT = OperandVT;
21959 else if (OperandVT != BaseVT)
21960 return MVT::INVALID_SIMPLE_VALUE_TYPE;
21961 }
21962
21963 return BaseVT;
21964}
21965
21966// When converting a <N x iX> vector to <N x i1> to store or use as a scalar
21967// iN, we can use a trick that extracts the i^th bit from the i^th element and
21968// then performs a vector add to get a scalar bitmask. This requires that each
21969// element's bits are either all 1 or all 0.
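// Editorial illustration (not from the upstream source): for a v4i32 compare
// result (each lane all-ones or all-zeros) the mask built below is
// <1, 2, 4, 8>; AND-ing with it and then doing a VECREDUCE_ADD yields a scalar
// whose bit i is set exactly when lane i of the comparison was true.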
21970 static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
21971 SDLoc DL(N);
21972 SDValue ComparisonResult(N, 0);
21973 EVT VecVT = ComparisonResult.getValueType();
21974 assert(VecVT.isVector() && "Must be a vector type");
21975
21976 unsigned NumElts = VecVT.getVectorNumElements();
21977 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
21978 return SDValue();
21979
21980 if (VecVT.getVectorElementType() != MVT::i1 &&
21981 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
21982 return SDValue();
21983
21984 // If we can find the original types to work on instead of a vector of i1,
21985 // we can avoid extend/extract conversion instructions.
21986 if (VecVT.getVectorElementType() == MVT::i1) {
21987 VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
21988 if (!VecVT.isSimple()) {
21989 unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector
21990 VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
21991 }
21992 }
21993 VecVT = VecVT.changeVectorElementTypeToInteger();
21994
21995 // Large vectors don't map directly to this conversion, so to avoid too many
21996 // edge cases, we don't apply it here. The conversion will likely still be
21997 // applied later via multiple smaller vectors, whose results are concatenated.
21998 if (VecVT.getSizeInBits() > 128)
21999 return SDValue();
22000
22001 // Ensure that all elements' bits are either 0s or 1s.
22002 ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
22003
22004 SmallVector<SDValue, 16> MaskConstants;
22005 if (VecVT == MVT::v16i8) {
22006 // v16i8 is a special case, as we have 16 entries but only 8 positional bits
22007 // per entry. We split it into two halves, apply the mask, zip the halves to
22008 // create 8 x 16-bit values, and then perform the vector reduce.
22009 for (unsigned Half = 0; Half < 2; ++Half) {
22010 for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) {
22011 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
22012 }
22013 }
22014 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
22015 SDValue RepresentativeBits =
22016 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
22017
22018 SDValue UpperRepresentativeBits =
22019 DAG.getNode(AArch64ISD::EXT, DL, VecVT, RepresentativeBits,
22020 RepresentativeBits, DAG.getConstant(8, DL, MVT::i32));
22021 SDValue Zipped = DAG.getNode(AArch64ISD::ZIP1, DL, VecVT,
22022 RepresentativeBits, UpperRepresentativeBits);
22023 Zipped = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Zipped);
22024 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, Zipped);
22025 }
22026
22027 // All other vector sizes.
22028 unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1);
22029 for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) {
22030 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
22031 }
22032
22033 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
22034 SDValue RepresentativeBits =
22035 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
22036 EVT ResultVT = MVT::getIntegerVT(std::max<unsigned>(
22037 NumElts, VecVT.getVectorElementType().getSizeInBits()));
22038 return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits);
22039}
22040
22041 static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
22042 StoreSDNode *Store) {
22043 if (!Store->isTruncatingStore())
22044 return SDValue();
22045
22046 SDLoc DL(Store);
22047 SDValue VecOp = Store->getValue();
22048 EVT VT = VecOp.getValueType();
22049 EVT MemVT = Store->getMemoryVT();
22050
22051 if (!MemVT.isVector() || !VT.isVector() ||
22052 MemVT.getVectorElementType() != MVT::i1)
22053 return SDValue();
22054
22055 // If we are storing a vector that we are currently building, let
22056 // `scalarizeVectorStore()` handle this more efficiently.
22057 if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
22058 return SDValue();
22059
22060 VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp);
22061 SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG);
22062 if (!VectorBits)
22063 return SDValue();
22064
22065 EVT StoreVT =
22066 EVT::getIntegerVT(*DAG.getContext(), MemVT.getStoreSizeInBits());
22067 SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT);
22068 return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(),
22069 Store->getMemOperand());
22070}
22071
22072 static bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
22073 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
22074 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
22075 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
22076}
22077
22078// Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
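// Editorial illustration (not from the upstream source): the wide source of
// the truncate is inserted into a 4-element vector, bitcast to bytes, and the
// three live bytes are then stored individually at offsets 2, 1 and 0, i.e.
// roughly three STRB/ST1.b instructions instead of an illegal v3i8 store.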
22079 static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
22080 const AArch64Subtarget *Subtarget) {
22081 SDValue Value = ST->getValue();
22082 EVT ValueVT = Value.getValueType();
22083
22084 if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
22085 Value.getOpcode() != ISD::TRUNCATE ||
22086 ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
22087 return SDValue();
22088
22089 assert(ST->getOffset().isUndef() && "undef offset expected");
22090 SDLoc DL(ST);
22091 auto WideVT = EVT::getVectorVT(
22092 *DAG.getContext(),
22093 Value->getOperand(0).getValueType().getVectorElementType(), 4);
22094 SDValue UndefVector = DAG.getUNDEF(WideVT);
22095 SDValue WideTrunc = DAG.getNode(
22096 ISD::INSERT_SUBVECTOR, DL, WideVT,
22097 {UndefVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)});
22098 SDValue Cast = DAG.getNode(
22099 ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
22100 WideTrunc);
22101
22102 MachineFunction &MF = DAG.getMachineFunction();
22103 SDValue Chain = ST->getChain();
22104 MachineMemOperand *MMO = ST->getMemOperand();
22105 unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
22106 SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22107 DAG.getConstant(2 * IdxScale, DL, MVT::i64));
22108 TypeSize Offset2 = TypeSize::getFixed(2);
22109 SDValue Ptr2 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset2, DL);
22110 Chain = DAG.getStore(Chain, DL, E2, Ptr2, MF.getMachineMemOperand(MMO, 2, 1));
22111
22112 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22113 DAG.getConstant(1 * IdxScale, DL, MVT::i64));
22114 TypeSize Offset1 = TypeSize::getFixed(1);
22115 SDValue Ptr1 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset1, DL);
22116 Chain = DAG.getStore(Chain, DL, E1, Ptr1, MF.getMachineMemOperand(MMO, 1, 1));
22117
22118 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22119 DAG.getConstant(0, DL, MVT::i64));
22120 Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(),
22121 MF.getMachineMemOperand(MMO, 0, 1));
22122 return Chain;
22123}
22124
22125 static SDValue performSTORECombine(SDNode *N,
22126 TargetLowering::DAGCombinerInfo &DCI,
22127 SelectionDAG &DAG,
22128 const AArch64Subtarget *Subtarget) {
22129 StoreSDNode *ST = cast<StoreSDNode>(N);
22130 SDValue Chain = ST->getChain();
22131 SDValue Value = ST->getValue();
22132 SDValue Ptr = ST->getBasePtr();
22133 EVT ValueVT = Value.getValueType();
22134
22135 auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
22136 EVT EltVT = VT.getVectorElementType();
22137 return EltVT == MVT::f32 || EltVT == MVT::f64;
22138 };
22139
22140 if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
22141 return Res;
22142
22143 // If this is an FP_ROUND followed by a store, fold this into a truncating
22144 // store. We can do this even if this is already a truncstore.
22145 // We purposefully don't care about legality of the nodes here as we know
22146 // they can be split down into something legal.
22147 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
22148 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
22149 Subtarget->useSVEForFixedLengthVectors() &&
22150 ValueVT.isFixedLengthVector() &&
22151 ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
22152 hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
22153 return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
22154 ST->getMemoryVT(), ST->getMemOperand());
22155
22156 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
22157 return Split;
22158
22159 if (Subtarget->supportsAddressTopByteIgnored() &&
22160 performTBISimplification(N->getOperand(2), DCI, DAG))
22161 return SDValue(N, 0);
22162
22163 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
22164 return Store;
22165
22166 if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
22167 return Store;
22168
22169 if (ST->isTruncatingStore()) {
22170 EVT StoreVT = ST->getMemoryVT();
22171 if (!isHalvingTruncateOfLegalScalableType(ValueVT, StoreVT))
22172 return SDValue();
22173 if (SDValue Rshrnb =
22174 trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) {
22175 return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(),
22176 StoreVT, ST->getMemOperand());
22177 }
22178 }
22179
22180 return SDValue();
22181}
22182
22183 static SDValue performMSTORECombine(SDNode *N,
22184 TargetLowering::DAGCombinerInfo &DCI,
22185 SelectionDAG &DAG,
22186 const AArch64Subtarget *Subtarget) {
22187 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
22188 SDValue Value = MST->getValue();
22189 SDValue Mask = MST->getMask();
22190 SDLoc DL(N);
22191
22192 // If this is a UZP1 followed by a masked store, fold this into a masked
22193 // truncating store. We can do this even if this is already a masked
22194 // truncstore.
22195 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
22196 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
22197 Value.getValueType().isInteger()) {
22198 Value = Value.getOperand(0);
22199 if (Value.getOpcode() == ISD::BITCAST) {
22200 EVT HalfVT =
22201 Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
22202 EVT InVT = Value.getOperand(0).getValueType();
22203
22204 if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
22205 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
22206 unsigned PgPattern = Mask->getConstantOperandVal(0);
22207
22208 // Ensure we can double the size of the predicate pattern
22209 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
22210 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
22211 MinSVESize) {
22212 Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
22213 PgPattern);
22214 return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
22215 MST->getBasePtr(), MST->getOffset(), Mask,
22216 MST->getMemoryVT(), MST->getMemOperand(),
22217 MST->getAddressingMode(),
22218 /*IsTruncating=*/true);
22219 }
22220 }
22221 }
22222 }
22223
22224 if (MST->isTruncatingStore()) {
22225 EVT ValueVT = Value->getValueType(0);
22226 EVT MemVT = MST->getMemoryVT();
22227 if (!isHalvingTruncateOfLegalScalableType(ValueVT, MemVT))
22228 return SDValue();
22229 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Value, DAG, Subtarget)) {
22230 return DAG.getMaskedStore(MST->getChain(), DL, Rshrnb, MST->getBasePtr(),
22231 MST->getOffset(), MST->getMask(),
22232 MST->getMemoryVT(), MST->getMemOperand(),
22233 MST->getAddressingMode(), true);
22234 }
22235 }
22236
22237 return SDValue();
22238}
22239
22240/// \return true if part of the index was folded into the Base.
22241static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
22242 SDLoc DL, SelectionDAG &DAG) {
22243 // This function assumes a vector of i64 indices.
22244 EVT IndexVT = Index.getValueType();
22245 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
22246 return false;
22247
22248 // Simplify:
22249 // BasePtr = Ptr
22250 // Index = X + splat(Offset)
22251 // ->
22252 // BasePtr = Ptr + Offset * scale.
22253 // Index = X
22254 if (Index.getOpcode() == ISD::ADD) {
22255 if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
22256 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
22257 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
22258 Index = Index.getOperand(0);
22259 return true;
22260 }
22261 }
22262
22263 // Simplify:
22264 // BasePtr = Ptr
22265 // Index = (X + splat(Offset)) << splat(Shift)
22266 // ->
22267 // BasePtr = Ptr + (Offset << Shift) * scale)
22268 // Index = X << splat(shift)
22269 if (Index.getOpcode() == ISD::SHL &&
22270 Index.getOperand(0).getOpcode() == ISD::ADD) {
22271 SDValue Add = Index.getOperand(0);
22272 SDValue ShiftOp = Index.getOperand(1);
22273 SDValue OffsetOp = Add.getOperand(1);
22274 if (auto Shift = DAG.getSplatValue(ShiftOp))
22275 if (auto Offset = DAG.getSplatValue(OffsetOp)) {
22276 Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
22277 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
22278 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
22279 Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
22280 Add.getOperand(0), ShiftOp);
22281 return true;
22282 }
22283 }
22284
22285 return false;
22286}
22287
22288// Analyse the specified address returning true if a more optimal addressing
22289// mode is available. When returning true all parameters are updated to reflect
22290// their recommended values.
22291 static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
22292 SDValue &BasePtr, SDValue &Index,
22293 SelectionDAG &DAG) {
22294 // Try to iteratively fold parts of the index into the base pointer to
22295 // simplify the index as much as possible.
22296 bool Changed = false;
22297 while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
22298 Changed = true;
22299
22300 // Only consider element types that are pointer sized as smaller types can
22301 // be easily promoted.
22302 EVT IndexVT = Index.getValueType();
22303 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
22304 return Changed;
22305
22306 // Can indices be trivially shrunk?
22307 EVT DataVT = N->getOperand(1).getValueType();
22308 // Don't attempt to shrink the index for fixed vectors of 64 bit data since it
22309 // will later be re-extended to 64 bits in legalization
22310 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
22311 return Changed;
22312 if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
22313 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
22314 Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
22315 return true;
22316 }
22317
22318 // Match:
22319 // Index = step(const)
22320 int64_t Stride = 0;
22321 if (Index.getOpcode() == ISD::STEP_VECTOR) {
22322 Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
22323 }
22324 // Match:
22325 // Index = step(const) << shift(const)
22326 else if (Index.getOpcode() == ISD::SHL &&
22327 Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
22328 SDValue RHS = Index.getOperand(1);
22329 if (auto *Shift =
22330 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
22331 int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
22332 Stride = Step << Shift->getZExtValue();
22333 }
22334 }
22335
22336 // Return early because no supported pattern is found.
22337 if (Stride == 0)
22338 return Changed;
22339
22340 if (Stride < std::numeric_limits<int32_t>::min() ||
22341 Stride > std::numeric_limits<int32_t>::max())
22342 return Changed;
22343
22344 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
22345 unsigned MaxVScale =
22346 Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
22347 int64_t LastElementOffset =
22348 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
22349
22350 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
22351 LastElementOffset > std::numeric_limits<int32_t>::max())
22352 return Changed;
22353
22354 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
22355 // Stride does not scale explicitly by 'Scale', because it happens in
22356 // the gather/scatter addressing mode.
22357 Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride));
22358 return true;
22359}
22360
22361 static SDValue performMaskedGatherScatterCombine(
22362 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
22363 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
22364 assert(MGS && "Can only combine gather load or scatter store nodes");
22365
22366 if (!DCI.isBeforeLegalize())
22367 return SDValue();
22368
22369 SDLoc DL(MGS);
22370 SDValue Chain = MGS->getChain();
22371 SDValue Scale = MGS->getScale();
22372 SDValue Index = MGS->getIndex();
22373 SDValue Mask = MGS->getMask();
22374 SDValue BasePtr = MGS->getBasePtr();
22375 ISD::MemIndexType IndexType = MGS->getIndexType();
22376
22377 if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
22378 return SDValue();
22379
22380 // Here we catch such cases early and change MGATHER's IndexType to allow
22381 // the use of an Index that's more legalisation friendly.
22382 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
22383 SDValue PassThru = MGT->getPassThru();
22384 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
22385 return DAG.getMaskedGather(
22386 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
22387 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
22388 }
22389 auto *MSC = cast<MaskedScatterSDNode>(MGS);
22390 SDValue Data = MSC->getValue();
22391 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
22392 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL,
22393 Ops, MSC->getMemOperand(), IndexType,
22394 MSC->isTruncatingStore());
22395}
22396
22397/// Target-specific DAG combine function for NEON load/store intrinsics
22398/// to merge base address updates.
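/// Editorial illustration (not from the upstream source): for example
///   %v = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %p)
///   %p.next = getelementptr i8, ptr %p, i64 32
/// is merged below into the post-indexed form "ld2 { v0.4s, v1.4s }, [x0], #32"
/// (LD2post), since the constant increment equals the 32 bytes loaded.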
22399 static SDValue performNEONPostLDSTCombine(SDNode *N,
22400 TargetLowering::DAGCombinerInfo &DCI,
22401 SelectionDAG &DAG) {
22402 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
22403 return SDValue();
22404
22405 unsigned AddrOpIdx = N->getNumOperands() - 1;
22406 SDValue Addr = N->getOperand(AddrOpIdx);
22407
22408 // Search for a use of the address operand that is an increment.
22409 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
22410 UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
22411 SDNode *User = *UI;
22412 if (User->getOpcode() != ISD::ADD ||
22413 UI.getUse().getResNo() != Addr.getResNo())
22414 continue;
22415
22416 // Check that the add is independent of the load/store. Otherwise, folding
22417 // it would create a cycle.
22418 SmallPtrSet<const SDNode *, 32> Visited;
22419 SmallVector<const SDNode *, 16> Worklist;
22420 Visited.insert(Addr.getNode());
22421 Worklist.push_back(N);
22422 Worklist.push_back(User);
22423 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
22424 SDNode::hasPredecessorHelper(User, Visited, Worklist))
22425 continue;
22426
22427 // Find the new opcode for the updating load/store.
22428 bool IsStore = false;
22429 bool IsLaneOp = false;
22430 bool IsDupOp = false;
22431 unsigned NewOpc = 0;
22432 unsigned NumVecs = 0;
22433 unsigned IntNo = N->getConstantOperandVal(1);
22434 switch (IntNo) {
22435 default: llvm_unreachable("unexpected intrinsic for Neon base update");
22436 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
22437 NumVecs = 2; break;
22438 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
22439 NumVecs = 3; break;
22440 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
22441 NumVecs = 4; break;
22442 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
22443 NumVecs = 2; IsStore = true; break;
22444 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
22445 NumVecs = 3; IsStore = true; break;
22446 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
22447 NumVecs = 4; IsStore = true; break;
22448 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
22449 NumVecs = 2; break;
22450 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
22451 NumVecs = 3; break;
22452 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
22453 NumVecs = 4; break;
22454 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
22455 NumVecs = 2; IsStore = true; break;
22456 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
22457 NumVecs = 3; IsStore = true; break;
22458 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
22459 NumVecs = 4; IsStore = true; break;
22460 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
22461 NumVecs = 2; IsDupOp = true; break;
22462 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
22463 NumVecs = 3; IsDupOp = true; break;
22464 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
22465 NumVecs = 4; IsDupOp = true; break;
22466 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
22467 NumVecs = 2; IsLaneOp = true; break;
22468 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
22469 NumVecs = 3; IsLaneOp = true; break;
22470 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
22471 NumVecs = 4; IsLaneOp = true; break;
22472 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
22473 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
22474 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
22475 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
22476 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
22477 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
22478 }
22479
22480 EVT VecTy;
22481 if (IsStore)
22482 VecTy = N->getOperand(2).getValueType();
22483 else
22484 VecTy = N->getValueType(0);
22485
22486 // If the increment is a constant, it must match the memory ref size.
22487 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
22488 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
22489 uint32_t IncVal = CInc->getZExtValue();
22490 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
22491 if (IsLaneOp || IsDupOp)
22492 NumBytes /= VecTy.getVectorNumElements();
22493 if (IncVal != NumBytes)
22494 continue;
22495 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
22496 }
22497 SmallVector<SDValue, 8> Ops;
22498 Ops.push_back(N->getOperand(0)); // Incoming chain
22499 // Load lane and store have vector list as input.
22500 if (IsLaneOp || IsStore)
22501 for (unsigned i = 2; i < AddrOpIdx; ++i)
22502 Ops.push_back(N->getOperand(i));
22503 Ops.push_back(Addr); // Base register
22504 Ops.push_back(Inc);
22505
22506 // Return Types.
22507 EVT Tys[6];
22508 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
22509 unsigned n;
22510 for (n = 0; n < NumResultVecs; ++n)
22511 Tys[n] = VecTy;
22512 Tys[n++] = MVT::i64; // Type of write back register
22513 Tys[n] = MVT::Other; // Type of the chain
22514 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
22515
22516 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
22517 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
22518 MemInt->getMemoryVT(),
22519 MemInt->getMemOperand());
22520
22521 // Update the uses.
22522 std::vector<SDValue> NewResults;
22523 for (unsigned i = 0; i < NumResultVecs; ++i) {
22524 NewResults.push_back(SDValue(UpdN.getNode(), i));
22525 }
22526 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
22527 DCI.CombineTo(N, NewResults);
22528 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
22529
22530 break;
22531 }
22532 return SDValue();
22533}
22534
22535// Checks to see if the value is the prescribed width and returns information
22536// about its extension mode.
22537static
22538bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
22539 ExtType = ISD::NON_EXTLOAD;
22540 switch(V.getNode()->getOpcode()) {
22541 default:
22542 return false;
22543 case ISD::LOAD: {
22544 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
22545 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
22546 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
22547 ExtType = LoadNode->getExtensionType();
22548 return true;
22549 }
22550 return false;
22551 }
22552 case ISD::AssertSext: {
22553 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
22554 if ((TypeNode->getVT() == MVT::i8 && width == 8)
22555 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
22556 ExtType = ISD::SEXTLOAD;
22557 return true;
22558 }
22559 return false;
22560 }
22561 case ISD::AssertZext: {
22562 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
22563 if ((TypeNode->getVT() == MVT::i8 && width == 8)
22564 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
22565 ExtType = ISD::ZEXTLOAD;
22566 return true;
22567 }
22568 return false;
22569 }
22570 case ISD::Constant:
22571 case ISD::TargetConstant: {
22572 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
22573 1LL << (width - 1);
22574 }
22575 }
22576
22577 return true;
22578}
22579
22580// This function does a whole lot of voodoo to determine if the tests are
22581// equivalent without and with a mask. Essentially what happens is that given a
22582// DAG resembling:
22583//
22584// +-------------+ +-------------+ +-------------+ +-------------+
22585// | Input | | AddConstant | | CompConstant| | CC |
22586// +-------------+ +-------------+ +-------------+ +-------------+
22587// | | | |
22588// V V | +----------+
22589// +-------------+ +----+ | |
22590// | ADD | |0xff| | |
22591// +-------------+ +----+ | |
22592// | | | |
22593// V V | |
22594// +-------------+ | |
22595// | AND | | |
22596// +-------------+ | |
22597// | | |
22598// +-----+ | |
22599// | | |
22600// V V V
22601// +-------------+
22602// | CMP |
22603// +-------------+
22604//
22605// The AND node may be safely removed for some combinations of inputs. In
22606// particular we need to take into account the extension type of the Input,
22607// the exact values of AddConstant, CompConstant, and CC, along with the nominal
22608 // width of the input (this can work for any width of input; the above graph is
22609 // specific to 8 bits).
22610//
22611// The specific equations were worked out by generating output tables for each
22612 // AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
22613// problem was simplified by working with 4 bit inputs, which means we only
22614// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
22615// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
22616// patterns present in both extensions (0,7). For every distinct set of
22617// AddConstant and CompConstants bit patterns we can consider the masked and
22618// unmasked versions to be equivalent if the result of this function is true for
22619 // all 16 distinct bit patterns for the current extension type of Input (w0).
22620//
22621// sub w8, w0, w1
22622// and w10, w8, #0x0f
22623// cmp w8, w2
22624// cset w9, AArch64CC
22625// cmp w10, w2
22626// cset w11, AArch64CC
22627// cmp w9, w11
22628// cset w0, eq
22629// ret
22630//
22631// Since the above function shows when the outputs are equivalent it defines
22632// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
22633// would be expensive to run during compiles. The equations below were written
22634 // in a test harness that confirmed they gave outputs equivalent to the above
22635 // function for all inputs, so they can be used to determine if the removal is
22636// legal instead.
22637//
22638 // isEquivalentMaskless() is the code for testing if the AND can be removed,
22639 // factored out of the DAG recognition since the DAG can take several forms.
22640
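// Editorial illustration (not from the upstream source): the simplest row of
// those tables is AddConstant == 0 with a zero-extended 8-bit input; then the
// add is the identity and the AND with 0xff cannot change the value, so the
// masked and unmasked compares trivially agree for every condition code, which
// is why several of the cases below accept AddConstant == 0 directly.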
22641static bool isEquivalentMaskless(unsigned CC, unsigned width,
22642 ISD::LoadExtType ExtType, int AddConstant,
22643 int CompConstant) {
22644 // By being careful about our equations and only writing them in terms of
22645 // symbolic values and well-known constants (0, 1, -1, MaxUInt) we can
22646 // make them generally applicable to all bit widths.
22647 int MaxUInt = (1 << width);
22648
22649 // For the purposes of these comparisons sign extending the type is
22650 // equivalent to zero extending the add and displacing it by half the integer
22651 // width. Provided we are careful and make sure our equations are valid over
22652 // the whole range we can just adjust the input and avoid writing equations
22653 // for sign extended inputs.
22654 if (ExtType == ISD::SEXTLOAD)
22655 AddConstant -= (1 << (width-1));
22656
22657 switch(CC) {
22658 case AArch64CC::LE:
22659 case AArch64CC::GT:
22660 if ((AddConstant == 0) ||
22661 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
22662 (AddConstant >= 0 && CompConstant < 0) ||
22663 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
22664 return true;
22665 break;
22666 case AArch64CC::LT:
22667 case AArch64CC::GE:
22668 if ((AddConstant == 0) ||
22669 (AddConstant >= 0 && CompConstant <= 0) ||
22670 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
22671 return true;
22672 break;
22673 case AArch64CC::HI:
22674 case AArch64CC::LS:
22675 if ((AddConstant >= 0 && CompConstant < 0) ||
22676 (AddConstant <= 0 && CompConstant >= -1 &&
22677 CompConstant < AddConstant + MaxUInt))
22678 return true;
22679 break;
22680 case AArch64CC::PL:
22681 case AArch64CC::MI:
22682 if ((AddConstant == 0) ||
22683 (AddConstant > 0 && CompConstant <= 0) ||
22684 (AddConstant < 0 && CompConstant <= AddConstant))
22685 return true;
22686 break;
22687 case AArch64CC::LO:
22688 case AArch64CC::HS:
22689 if ((AddConstant >= 0 && CompConstant <= 0) ||
22690 (AddConstant <= 0 && CompConstant >= 0 &&
22691 CompConstant <= AddConstant + MaxUInt))
22692 return true;
22693 break;
22694 case AArch64CC::EQ:
22695 case AArch64CC::NE:
22696 if ((AddConstant > 0 && CompConstant < 0) ||
22697 (AddConstant < 0 && CompConstant >= 0 &&
22698 CompConstant < AddConstant + MaxUInt) ||
22699 (AddConstant >= 0 && CompConstant >= 0 &&
22700 CompConstant >= AddConstant) ||
22701 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
22702 return true;
22703 break;
22704 case AArch64CC::VS:
22705 case AArch64CC::VC:
22706 case AArch64CC::AL:
22707 case AArch64CC::NV:
22708 return true;
22709 case AArch64CC::Invalid:
22710 break;
22711 }
22712
22713 return false;
22714}
22715
22716 // (X & C) >u Mask --> (X & (C & ~Mask)) != 0
22717 // (X & C) <u Pow2 --> (X & (C & ~(Pow2-1))) == 0
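// Editorial illustration (not from the upstream source): with C == 0xFF,
//   (X & 0xFF) >u 0x0F  becomes  (X & 0xF0) != 0   (HI, mask 0x0F)
//   (X & 0xFF) <u 0x10  becomes  (X & 0xF0) == 0   (LO, power of two 0x10)
// which is what the ANDS plus adjusted condition code built below implement.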
22718 static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
22719 SDNode *AndNode, SelectionDAG &DAG,
22720 unsigned CCIndex, unsigned CmpIndex,
22721 unsigned CC) {
22722 ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(SubsNode->getOperand(1));
22723 if (!SubsC)
22724 return SDValue();
22725
22726 APInt SubsAP = SubsC->getAPIntValue();
22727 if (CC == AArch64CC::HI) {
22728 if (!SubsAP.isMask())
22729 return SDValue();
22730 } else if (CC == AArch64CC::LO) {
22731 if (!SubsAP.isPowerOf2())
22732 return SDValue();
22733 } else
22734 return SDValue();
22735
22736 ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1));
22737 if (!AndC)
22738 return SDValue();
22739
22740 APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
22741
22742 SDLoc DL(N);
22743 APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
22744 SDValue ANDS = DAG.getNode(
22745 AArch64ISD::ANDS, DL, SubsNode->getVTList(), AndNode->getOperand(0),
22746 DAG.getConstant(AndSMask, DL, SubsC->getValueType(0)));
22747 SDValue AArch64_CC =
22748 DAG.getConstant(CC == AArch64CC::HI ? AArch64CC::NE : AArch64CC::EQ, DL,
22749 N->getOperand(CCIndex)->getValueType(0));
22750
22751 // For now, only performCSELCombine and performBRCONDCombine call this
22752 // function. And both of them pass 2 for CCIndex, 3 for CmpIndex with 4
22753 // operands. So just initialize the operands directly to simplify the code. If we have
22754 // some other case with a different CCIndex or CmpIndex, we need to use a for loop to
22755 // rewrite the code here.
22756 // TODO: Do we need to assert number of operand is 4 here?
22757 assert((CCIndex == 2 && CmpIndex == 3) &&
22758 "Expected CCIndex to be 2 and CmpIndex to be 3.");
22759 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), AArch64_CC,
22760 ANDS.getValue(1)};
22761 return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops);
22762}
22763
22764static
22765 SDValue performCONDCombine(SDNode *N,
22766 TargetLowering::DAGCombinerInfo &DCI,
22767 SelectionDAG &DAG, unsigned CCIndex,
22768 unsigned CmpIndex) {
22769 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
22770 SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
22771 unsigned CondOpcode = SubsNode->getOpcode();
22772
22773 if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0))
22774 return SDValue();
22775
22776 // There is a SUBS feeding this condition. Is it fed by a mask we can
22777 // use?
22778
22779 SDNode *AndNode = SubsNode->getOperand(0).getNode();
22780 unsigned MaskBits = 0;
22781
22782 if (AndNode->getOpcode() != ISD::AND)
22783 return SDValue();
22784
22785 if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
22786 CmpIndex, CC))
22787 return Val;
22788
22789 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
22790 uint32_t CNV = CN->getZExtValue();
22791 if (CNV == 255)
22792 MaskBits = 8;
22793 else if (CNV == 65535)
22794 MaskBits = 16;
22795 }
22796
22797 if (!MaskBits)
22798 return SDValue();
22799
22800 SDValue AddValue = AndNode->getOperand(0);
22801
22802 if (AddValue.getOpcode() != ISD::ADD)
22803 return SDValue();
22804
22805 // The basic dag structure is correct, grab the inputs and validate them.
22806
22807 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
22808 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
22809 SDValue SubsInputValue = SubsNode->getOperand(1);
22810
22811 // The mask is present and all of the values originate from a smaller type, so
22812 // let's see if the mask is superfluous.
22813
22814 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
22815 !isa<ConstantSDNode>(SubsInputValue.getNode()))
22816 return SDValue();
22817
22818 ISD::LoadExtType ExtType;
22819
22820 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
22821 !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
22822 !checkValueWidth(AddInputValue1, MaskBits, ExtType))
22823 return SDValue();
22824
22825 if (!isEquivalentMaskless(CC, MaskBits, ExtType,
22826 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
22827 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
22828 return SDValue();
22829
22830 // The AND is not necessary, remove it.
22831
22832 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
22833 SubsNode->getValueType(1));
22834 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
22835
22836 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
22837 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
22838
22839 return SDValue(N, 0);
22840}
22841
22842// Optimize compare with zero and branch.
22843static SDValue performBRCONDCombine(SDNode *N,
22844 TargetLowering::DAGCombinerInfo &DCI,
22845 SelectionDAG &DAG) {
22846 MachineFunction &MF = DAG.getMachineFunction();
22847 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
22848 // will not be produced, as they are conditional branch instructions that do
22849 // not set flags.
22850 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
22851 return SDValue();
22852
22853 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
22854 N = NV.getNode();
22855 SDValue Chain = N->getOperand(0);
22856 SDValue Dest = N->getOperand(1);
22857 SDValue CCVal = N->getOperand(2);
22858 SDValue Cmp = N->getOperand(3);
22859
22860 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
22861 unsigned CC = CCVal->getAsZExtVal();
22862 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
22863 return SDValue();
22864
22865 unsigned CmpOpc = Cmp.getOpcode();
22866 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
22867 return SDValue();
22868
22869 // Only attempt folding if there is only one use of the flag and no use of the
22870 // value.
22871 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
22872 return SDValue();
22873
22874 SDValue LHS = Cmp.getOperand(0);
22875 SDValue RHS = Cmp.getOperand(1);
22876
22877 assert(LHS.getValueType() == RHS.getValueType() &&
22878 "Expected the value type to be the same for both operands!");
22879 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
22880 return SDValue();
22881
22882 if (isNullConstant(LHS))
22883 std::swap(LHS, RHS);
22884
22885 if (!isNullConstant(RHS))
22886 return SDValue();
22887
22888 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
22889 LHS.getOpcode() == ISD::SRL)
22890 return SDValue();
22891
22892 // Fold the compare into the branch instruction.
22893 SDValue BR;
22894 if (CC == AArch64CC::EQ)
22895 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
22896 else
22897 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
22898
22899 // Do not add new nodes to DAG combiner worklist.
22900 DCI.CombineTo(N, BR, false);
22901
22902 return SDValue();
22903}
22904
22905static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
22906 unsigned CC = N->getConstantOperandVal(2);
22907 SDValue SUBS = N->getOperand(3);
22908 SDValue Zero, CTTZ;
22909
22910 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
22911 Zero = N->getOperand(0);
22912 CTTZ = N->getOperand(1);
22913 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
22914 Zero = N->getOperand(1);
22915 CTTZ = N->getOperand(0);
22916 } else
22917 return SDValue();
22918
22919 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
22920 (CTTZ.getOpcode() == ISD::TRUNCATE &&
22921 CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
22922 return SDValue();
22923
22924 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
22925 "Illegal type in CTTZ folding");
22926
22927 if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
22928 return SDValue();
22929
22930 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
22931 ? CTTZ.getOperand(0).getOperand(0)
22932 : CTTZ.getOperand(0);
22933
22934 if (X != SUBS.getOperand(0))
22935 return SDValue();
22936
22937 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
22938 ? CTTZ.getOperand(0).getValueSizeInBits()
22939 : CTTZ.getValueSizeInBits();
22940 SDValue BitWidthMinusOne =
22941 DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
22942 return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
22943 BitWidthMinusOne);
22944}
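
A standalone sketch of why the fold above is sound (not from this file; cttz32 is a hypothetical helper that mirrors the DAG node's defined-at-zero behaviour):

  // Counts trailing zeros, returning the full bit width (32) for a zero input,
  // which is what the lowered RBIT+CLZ sequence produces.
  static unsigned cttz32(unsigned X) {
    unsigned N = 0;
    while (N < 32 && ((X >> N) & 1u) == 0)
      ++N;
    return N;
  }

  // CSEL 0, cttz(X), eq(X, 0) selects 0 exactly when cttz(X) == 32, and
  // 32 & 31 == 0, so the select is equivalent to cttz(X) & (BitWidth - 1).
  static bool cttzFoldHolds(unsigned X) {
    return (X == 0 ? 0u : cttz32(X)) == (cttz32(X) & 31u);
  }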
22945
22946// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
22947// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
22948// Where x and y are constants and x != y
22949
22950// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
22951// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
22952// Where x and y are constants and x != y
22953static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
22954 SDValue L = Op->getOperand(0);
22955 SDValue R = Op->getOperand(1);
22956 AArch64CC::CondCode OpCC =
22957 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
22958
22959 SDValue OpCmp = Op->getOperand(3);
22960 if (!isCMP(OpCmp))
22961 return SDValue();
22962
22963 SDValue CmpLHS = OpCmp.getOperand(0);
22964 SDValue CmpRHS = OpCmp.getOperand(1);
22965
22966 if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
22967 std::swap(CmpLHS, CmpRHS);
22968 else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
22969 return SDValue();
22970
22971 SDValue X = CmpLHS->getOperand(0);
22972 SDValue Y = CmpLHS->getOperand(1);
22973 if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y) || X == Y) {
22974 return SDValue();
22975 }
22976
22977 // If one of the constants is an opaque constant, the X and Y SDNodes can
22978 // still be different even though the underlying values are the same. So check
22979 // the APInts here to make sure the fold is correct.
22980 ConstantSDNode *CX = cast<ConstantSDNode>(X);
22981 ConstantSDNode *CY = cast<ConstantSDNode>(Y);
22982 if (CX->getAPIntValue() == CY->getAPIntValue())
22983 return SDValue();
22984
22985 AArch64CC::CondCode CC =
22986 static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
22987 SDValue Cond = CmpLHS->getOperand(3);
22988
22989 if (CmpRHS == Y)
22990 CC = AArch64CC::getInvertedCondCode(CC);
22991 else if (CmpRHS != X)
22992 return SDValue();
22993
22994 if (OpCC == AArch64CC::NE)
22995 CC = AArch64CC::getInvertedCondCode(CC);
22996 else if (OpCC != AArch64CC::EQ)
22997 return SDValue();
22998
22999 SDLoc DL(Op);
23000 EVT VT = Op->getValueType(0);
23001
23002 SDValue CCValue = DAG.getConstant(CC, DL, MVT::i32);
23003 return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
23004}
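
A scalar sketch of the CSEL-of-CSEL rule (not from this file; X = 7 and Y = 3 are hypothetical distinct constants):

  // Because the inner select only ever produces 7 or 3, comparing its result
  // against 7 recovers the inner condition, so the outer select can use that
  // condition directly (comparing against 3 would give the inverted form).
  static bool cselOfCselHolds(bool Cond2, int L, int R) {
    int Inner = Cond2 ? 7 : 3;        // CSEL x, y, cc2, cond
    int Outer = (Inner == 7) ? L : R; // CSEL l, r, EQ, (CMP Inner, x)
    int Folded = Cond2 ? L : R;       // CSEL l, r, cc2, cond
    return Outer == Folded;
  }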
23005
23006// Optimize CSEL instructions
23007static SDValue performCSELCombine(SDNode *N,
23008 TargetLowering::DAGCombinerInfo &DCI,
23009 SelectionDAG &DAG) {
23010 // CSEL x, x, cc -> x
23011 if (N->getOperand(0) == N->getOperand(1))
23012 return N->getOperand(0);
23013
23014 if (SDValue R = foldCSELOfCSEL(N, DAG))
23015 return R;
23016
23017 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
23018 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
23019 if (SDValue Folded = foldCSELofCTTZ(N, DAG))
23020 return Folded;
23021
23022 return performCONDCombine(N, DCI, DAG, 2, 3);
23023}
23024
23025// Try to re-use an already extended operand of a vector SetCC feeding an
23026// extended select. Doing so avoids requiring another full extension of the
23027// SET_CC result when lowering the select.
23028static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
23029 EVT Op0MVT = Op->getOperand(0).getValueType();
23030 if (!Op0MVT.isVector() || Op->use_empty())
23031 return SDValue();
23032
23033 // Make sure that all uses of Op are VSELECTs with result matching types where
23034 // the result type has a larger element type than the SetCC operand.
23035 SDNode *FirstUse = *Op->use_begin();
23036 if (FirstUse->getOpcode() != ISD::VSELECT)
23037 return SDValue();
23038 EVT UseMVT = FirstUse->getValueType(0);
23039 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
23040 return SDValue();
23041 if (any_of(Op->uses(), [&UseMVT](const SDNode *N) {
23042 return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
23043 }))
23044 return SDValue();
23045
23046 APInt V;
23047 if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
23048 return SDValue();
23049
23050 SDLoc DL(Op);
23051 SDValue Op0ExtV;
23052 SDValue Op1ExtV;
23053 ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
23054 // Check if the first operand of the SET_CC is already extended. If it is,
23055 // split the SET_CC and re-use the extended version of the operand.
23056 SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
23057 Op->getOperand(0));
23058 SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
23059 Op->getOperand(0));
23060 if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
23061 Op0ExtV = SDValue(Op0SExt, 0);
23062 Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
23063 } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
23064 Op0ExtV = SDValue(Op0ZExt, 0);
23065 Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
23066 } else
23067 return SDValue();
23068
23069 return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
23070 Op0ExtV, Op1ExtV, Op->getOperand(2));
23071}
23072
23073static SDValue
23075 SelectionDAG &DAG) {
23076 SDValue Vec = N->getOperand(0);
23077 if (DCI.isBeforeLegalize() &&
23078 Vec.getValueType().getVectorElementType() == MVT::i1 &&
23081 SDLoc DL(N);
23082 return getVectorBitwiseReduce(N->getOpcode(), Vec, N->getValueType(0), DL,
23083 DAG);
23084 }
23085
23086 return SDValue();
23087}
23088
23089static SDValue performSETCCCombine(SDNode *N,
23090 TargetLowering::DAGCombinerInfo &DCI,
23091 SelectionDAG &DAG) {
23092 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
23093 SDValue LHS = N->getOperand(0);
23094 SDValue RHS = N->getOperand(1);
23095 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
23096 SDLoc DL(N);
23097 EVT VT = N->getValueType(0);
23098
23099 if (SDValue V = tryToWidenSetCCOperands(N, DAG))
23100 return V;
23101
23102 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
23103 if (Cond == ISD::SETNE && isOneConstant(RHS) &&
23104 LHS->getOpcode() == AArch64ISD::CSEL &&
23105 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
23106 LHS->hasOneUse()) {
23107 // Invert CSEL's condition.
23108 auto OldCond =
23109 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
23110 auto NewCond = getInvertedCondCode(OldCond);
23111
23112 // csel 0, 1, !cond, X
23113 SDValue CSEL =
23114 DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
23115 LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
23116 LHS.getOperand(3));
23117 return DAG.getZExtOrTrunc(CSEL, DL, VT);
23118 }
23119
23120 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
23121 if (Cond == ISD::SETNE && isNullConstant(RHS) &&
23122 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
23123 LHS->getConstantOperandVal(1) < VT.getScalarSizeInBits() &&
23124 LHS->hasOneUse()) {
23125 EVT TstVT = LHS->getValueType(0);
23126 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) {
23127 // this pattern will get better opt in emitComparison
23128 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
23129 SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
23130 DAG.getConstant(TstImm, DL, TstVT));
23131 return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
23132 }
23133 }
23134
23135 // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
23136 // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
23137 // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne)
23138 // ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne)
23139 if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
23140 (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
23142 LHS->getOpcode() == ISD::BITCAST) {
23143 EVT ToVT = LHS->getValueType(0);
23144 EVT FromVT = LHS->getOperand(0).getValueType();
23145 if (FromVT.isFixedLengthVector() &&
23146 FromVT.getVectorElementType() == MVT::i1) {
23147 bool IsNull = isNullConstant(RHS);
23149 DL, MVT::i1, LHS->getOperand(0));
23150 LHS = DAG.getNode(IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, ToVT,
23151 LHS);
23152 return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
23153 }
23154 }
23155
23156 // Try to perform the memcmp when the result is tested for [in]equality with 0
23157 if (SDValue V = performOrXorChainCombine(N, DAG))
23158 return V;
23159
23160 return SDValue();
23161}
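
For the srl rewrite above, a standalone scalar check (not from this file, with a hypothetical shift amount of 5 on a 32-bit value):

  // Comparing a right shift against zero only asks whether any of the bits the
  // shift keeps are set, which is exactly what the masked form tests.
  static bool srlSetccRewriteHolds(unsigned X) {
    bool Original = (X >> 5) != 0;
    bool Rewritten = (X & (~0u << 5)) != 0;
    return Original == Rewritten;
  }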
23162
23163// Replace a flag-setting operator (eg ANDS) with the generic version
23164// (eg AND) if the flag is unused.
23167 unsigned GenericOpcode) {
23168 SDLoc DL(N);
23169 SDValue LHS = N->getOperand(0);
23170 SDValue RHS = N->getOperand(1);
23171 EVT VT = N->getValueType(0);
23172
23173 // If the flag result isn't used, convert back to a generic opcode.
23174 if (!N->hasAnyUseOfValue(1)) {
23175 SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
23176 return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)},
23177 DL);
23178 }
23179
23180 // Combine identical generic nodes into this node, re-using the result.
23181 if (SDNode *Generic = DCI.DAG.getNodeIfExists(
23182 GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
23183 DCI.CombineTo(Generic, SDValue(N, 0));
23184
23185 return SDValue();
23186}
23187
23188static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
23189 // setcc_merge_zero pred
23190 // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
23191 // => extract_subvector (inner setcc_merge_zero)
23192 SDValue Pred = N->getOperand(0);
23193 SDValue LHS = N->getOperand(1);
23194 SDValue RHS = N->getOperand(2);
23195 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
23196
23197 if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
23198 LHS->getOpcode() != ISD::SIGN_EXTEND)
23199 return SDValue();
23200
23201 SDValue Extract = LHS->getOperand(0);
23202 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
23203 Extract->getValueType(0) != N->getValueType(0) ||
23204 Extract->getConstantOperandVal(1) != 0)
23205 return SDValue();
23206
23207 SDValue InnerSetCC = Extract->getOperand(0);
23208 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
23209 return SDValue();
23210
23211 // By this point we've effectively got
23212 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
23213 // lanes are already zero then the trunc(sext()) sequence is redundant and we
23214 // can operate on A directly.
23215 SDValue InnerPred = InnerSetCC.getOperand(0);
23216 if (Pred.getOpcode() == AArch64ISD::PTRUE &&
23217 InnerPred.getOpcode() == AArch64ISD::PTRUE &&
23218 Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
23219 Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
23220 Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
23221 return Extract;
23222
23223 return SDValue();
23224}
23225
23226static SDValue
23227performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
23228 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
23229 "Unexpected opcode!");
23230
23231 SelectionDAG &DAG = DCI.DAG;
23232 SDValue Pred = N->getOperand(0);
23233 SDValue LHS = N->getOperand(1);
23234 SDValue RHS = N->getOperand(2);
23235 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
23236
23237 if (SDValue V = performSetCCPunpkCombine(N, DAG))
23238 return V;
23239
23240 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
23241 LHS->getOpcode() == ISD::SIGN_EXTEND &&
23242 LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
23243 // setcc_merge_zero(
23244 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
23245 // => setcc_merge_zero(pred, ...)
23246 if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
23247 LHS->getOperand(0)->getOperand(0) == Pred)
23248 return LHS->getOperand(0);
23249
23250 // setcc_merge_zero(
23251 // all_active, extend(nxvNi1 ...), != splat(0))
23252 // -> nxvNi1 ...
23253 if (isAllActivePredicate(DAG, Pred))
23254 return LHS->getOperand(0);
23255
23256 // setcc_merge_zero(
23257 // pred, extend(nxvNi1 ...), != splat(0))
23258 // -> nxvNi1 and(pred, ...)
23259 if (DCI.isAfterLegalizeDAG())
23260 // Do this after legalization to allow more folds on setcc_merge_zero
23261 // to be recognized.
23262 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
23263 LHS->getOperand(0), Pred);
23264 }
23265
23266 return SDValue();
23267}
23268
23269// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
23270// as well as whether the test should be inverted. This code is required to
23271// catch these cases (as opposed to standard dag combines) because
23272// AArch64ISD::TBZ is matched during legalization.
23273static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
23274 SelectionDAG &DAG) {
23275
23276 if (!Op->hasOneUse())
23277 return Op;
23278
23279 // We don't handle undef/constant-fold cases below, as they should have
23280 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
23281 // etc.)
23282
23283 // (tbz (trunc x), b) -> (tbz x, b)
23284 // This case is just here to enable more of the below cases to be caught.
23285 if (Op->getOpcode() == ISD::TRUNCATE &&
23286 Bit < Op->getValueType(0).getSizeInBits()) {
23287 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23288 }
23289
23290 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
23291 if (Op->getOpcode() == ISD::ANY_EXTEND &&
23292 Bit < Op->getOperand(0).getValueSizeInBits()) {
23293 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23294 }
23295
23296 if (Op->getNumOperands() != 2)
23297 return Op;
23298
23299 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
23300 if (!C)
23301 return Op;
23302
23303 switch (Op->getOpcode()) {
23304 default:
23305 return Op;
23306
23307 // (tbz (and x, m), b) -> (tbz x, b)
23308 case ISD::AND:
23309 if ((C->getZExtValue() >> Bit) & 1)
23310 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23311 return Op;
23312
23313 // (tbz (shl x, c), b) -> (tbz x, b-c)
23314 case ISD::SHL:
23315 if (C->getZExtValue() <= Bit &&
23316 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
23317 Bit = Bit - C->getZExtValue();
23318 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23319 }
23320 return Op;
23321
23322 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
23323 case ISD::SRA:
23324 Bit = Bit + C->getZExtValue();
23325 if (Bit >= Op->getValueType(0).getSizeInBits())
23326 Bit = Op->getValueType(0).getSizeInBits() - 1;
23327 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23328
23329 // (tbz (srl x, c), b) -> (tbz x, b+c)
23330 case ISD::SRL:
23331 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
23332 Bit = Bit + C->getZExtValue();
23333 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23334 }
23335 return Op;
23336
23337 // (tbz (xor x, -1), b) -> (tbnz x, b)
23338 case ISD::XOR:
23339 if ((C->getZExtValue() >> Bit) & 1)
23340 Invert = !Invert;
23341 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23342 }
23343}
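
A standalone sketch (not from this file) of a few of the bit-test rewrites above, using hypothetical shift amounts on a 32-bit value:

  // Extracts bit B of V, which is what TBZ/TBNZ ultimately test.
  static bool testBit(unsigned V, unsigned B) { return ((V >> B) & 1u) != 0; }

  static bool testBitRewritesHold(unsigned X) {
    bool Shl = testBit(X << 3, 5) == testBit(X, 2);   // (tbz (shl x, 3), 5) -> (tbz x, 2)
    bool Srl = testBit(X >> 2, 3) == testBit(X, 5);   // (tbz (srl x, 2), 3) -> (tbz x, 5)
    bool Xor = testBit(X ^ ~0u, 4) == !testBit(X, 4); // (tbz (xor x, -1), 4) -> (tbnz x, 4)
    return Shl && Srl && Xor;
  }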
23344
23345// Optimize test single bit zero/non-zero and branch.
23346static SDValue performTBZCombine(SDNode *N,
23347 TargetLowering::DAGCombinerInfo &DCI,
23348 SelectionDAG &DAG) {
23349 unsigned Bit = N->getConstantOperandVal(2);
23350 bool Invert = false;
23351 SDValue TestSrc = N->getOperand(1);
23352 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
23353
23354 if (TestSrc == NewTestSrc)
23355 return SDValue();
23356
23357 unsigned NewOpc = N->getOpcode();
23358 if (Invert) {
23359 if (NewOpc == AArch64ISD::TBZ)
23360 NewOpc = AArch64ISD::TBNZ;
23361 else {
23362 assert(NewOpc == AArch64ISD::TBNZ);
23363 NewOpc = AArch64ISD::TBZ;
23364 }
23365 }
23366
23367 SDLoc DL(N);
23368 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
23369 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
23370}
23371
23372// Swap vselect operands where it may allow a predicated operation to achieve
23373// the `sel`.
23374//
23375// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
23376// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
23377static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
23378 auto SelectA = N->getOperand(1);
23379 auto SelectB = N->getOperand(2);
23380 auto NTy = N->getValueType(0);
23381
23382 if (!NTy.isScalableVector())
23383 return SDValue();
23384 SDValue SetCC = N->getOperand(0);
23385 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
23386 return SDValue();
23387
23388 switch (SelectB.getOpcode()) {
23389 default:
23390 return SDValue();
23391 case ISD::FMUL:
23392 case ISD::FSUB:
23393 case ISD::FADD:
23394 break;
23395 }
23396 if (SelectA != SelectB.getOperand(0))
23397 return SDValue();
23398
23399 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
23400 ISD::CondCode InverseCC =
23402 auto InverseSetCC =
23403 DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
23404 SetCC.getOperand(1), InverseCC);
23405
23406 return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
23407 {InverseSetCC, SelectB, SelectA});
23408}
23409
23410// vselect (v1i1 setcc) ->
23411// vselect (v1iXX setcc) (XX is the size of the compared operand type)
23412// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
23413// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
23414// such VSELECT.
23416 if (auto SwapResult = trySwapVSelectOperands(N, DAG))
23417 return SwapResult;
23418
23419 SDValue N0 = N->getOperand(0);
23420 EVT CCVT = N0.getValueType();
23421
23422 if (isAllActivePredicate(DAG, N0))
23423 return N->getOperand(1);
23424
23425 if (isAllInactivePredicate(N0))
23426 return N->getOperand(2);
23427
23428 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
23429 // into (OR (ASR lhs, N-1), 1), which requires less instructions for the
23430 // supported types.
23431 SDValue SetCC = N->getOperand(0);
23432 if (SetCC.getOpcode() == ISD::SETCC &&
23433 SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
23434 SDValue CmpLHS = SetCC.getOperand(0);
23435 EVT VT = CmpLHS.getValueType();
23436 SDNode *CmpRHS = SetCC.getOperand(1).getNode();
23437 SDNode *SplatLHS = N->getOperand(1).getNode();
23438 SDNode *SplatRHS = N->getOperand(2).getNode();
23439 APInt SplatLHSVal;
23440 if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
23441 VT.isSimple() &&
23442 is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
23443 MVT::v2i32, MVT::v4i32, MVT::v2i64}),
23444 VT.getSimpleVT().SimpleTy) &&
23445 ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
23446 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
23448 unsigned NumElts = VT.getVectorNumElements();
23450 NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
23451 VT.getScalarType()));
23452 SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
23453
23454 auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
23455 auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
23456 return Or;
23457 }
23458 }
23459
23460 EVT CmpVT = N0.getOperand(0).getValueType();
23461 if (N0.getOpcode() != ISD::SETCC ||
23463 CCVT.getVectorElementType() != MVT::i1 ||
23465 return SDValue();
23466
23467 EVT ResVT = N->getValueType(0);
23468 // Only combine when the result type is of the same size as the compared
23469 // operands.
23470 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
23471 return SDValue();
23472
23473 SDValue IfTrue = N->getOperand(1);
23474 SDValue IfFalse = N->getOperand(2);
23475 SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
23476 N0.getOperand(0), N0.getOperand(1),
23477 cast<CondCodeSDNode>(N0.getOperand(2))->get());
23478 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
23479 IfTrue, IfFalse);
23480}
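
A per-lane scalar sketch of the sign-pattern rewrite above (not from this file; it assumes 32-bit lanes and an arithmetic right shift for signed values, which is what the vector ASR provides):

  // For any 32-bit x, (x > -1 ? 1 : -1) equals ((x >> 31) | 1): the shift
  // yields 0 for non-negative x and -1 (all ones) for negative x, and OR-ing
  // in 1 turns those into 1 and -1 respectively.
  static bool vselectSignPatternHolds(int X) {
    int Selected = (X > -1) ? 1 : -1;
    int Rewritten = (X >> 31) | 1;
    return Selected == Rewritten;
  }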
23481
23482/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
23483/// the compare-mask instructions rather than going via NZCV, even if LHS and
23484/// RHS are really scalar. This replaces any scalar setcc in the above pattern
23485/// with a vector one followed by a DUP shuffle on the result.
23488 SelectionDAG &DAG = DCI.DAG;
23489 SDValue N0 = N->getOperand(0);
23490 EVT ResVT = N->getValueType(0);
23491
23492 if (N0.getOpcode() != ISD::SETCC)
23493 return SDValue();
23494
23495 if (ResVT.isScalableVT())
23496 return SDValue();
23497
23498 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
23499 // scalar SetCCResultType. We also don't expect vectors, because we assume
23500 // that selects fed by vector SETCCs are canonicalized to VSELECT.
23501 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
23502 "Scalar-SETCC feeding SELECT has unexpected result type!");
23503
23504 // If NumMaskElts == 0, the comparison is larger than select result. The
23505 // largest real NEON comparison is 64-bits per lane, which means the result is
23506 // at most 32-bits and an illegal vector. Just bail out for now.
23507 EVT SrcVT = N0.getOperand(0).getValueType();
23508
23509 // Don't try to do this optimization when the setcc itself has i1 operands.
23510 // There are no legal vectors of i1, so this would be pointless. v1f16 is
23511 // ruled out to prevent the creation of setcc that need to be scalarized.
23512 if (SrcVT == MVT::i1 ||
23513 (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
23514 return SDValue();
23515
23516 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
23517 if (!ResVT.isVector() || NumMaskElts == 0)
23518 return SDValue();
23519
23520 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
23522
23523 // Also bail out if the vector CCVT isn't the same size as ResVT.
23524 // This can happen if the SETCC operand size doesn't divide the ResVT size
23525 // (e.g., f64 vs v3f32).
23526 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
23527 return SDValue();
23528
23529 // Make sure we didn't create illegal types, if we're not supposed to.
23530 assert(DCI.isBeforeLegalize() ||
23531 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
23532
23533 // First perform a vector comparison, where lane 0 is the one we're interested
23534 // in.
23535 SDLoc DL(N0);
23536 SDValue LHS =
23537 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
23538 SDValue RHS =
23539 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
23540 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
23541
23542 // Now duplicate the comparison mask we want across all other lanes.
23543 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
23544 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
23545 Mask = DAG.getNode(ISD::BITCAST, DL,
23546 ResVT.changeVectorElementTypeToInteger(), Mask);
23547
23548 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
23549}
23550
23553 EVT VT = N->getValueType(0);
23554 SDLoc DL(N);
23555 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
23556 // 128bit vector version.
23557 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
23559 SmallVector<SDValue> Ops(N->ops());
23560 if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
23561 DCI.DAG.getVTList(LVT), Ops)) {
23562 return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
23563 DCI.DAG.getConstant(0, DL, MVT::i64));
23564 }
23565 }
23566
23567 if (N->getOpcode() == AArch64ISD::DUP) {
23568 if (DCI.isAfterLegalizeDAG()) {
23569 // If scalar dup's operand is extract_vector_elt, try to combine them into
23570 // duplane. For example,
23571 //
23572 // t21: i32 = extract_vector_elt t19, Constant:i64<0>
23573 // t18: v4i32 = AArch64ISD::DUP t21
23574 // ==>
23575 // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
23576 SDValue EXTRACT_VEC_ELT = N->getOperand(0);
23577 if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23578 if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) {
23579 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
23580 return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0),
23581 EXTRACT_VEC_ELT.getOperand(1));
23582 }
23583 }
23584 }
23585
23586 return performPostLD1Combine(N, DCI, false);
23587 }
23588
23589 return SDValue();
23590}
23591
23592/// Get rid of unnecessary NVCASTs (that don't change the type).
23594 if (N->getValueType(0) == N->getOperand(0).getValueType())
23595 return N->getOperand(0);
23596 if (N->getOperand(0).getOpcode() == AArch64ISD::NVCAST)
23597 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(N), N->getValueType(0),
23598 N->getOperand(0).getOperand(0));
23599
23600 return SDValue();
23601}
23602
23603// If all users of the globaladdr are of the form (globaladdr + constant), find
23604// the smallest constant, fold it into the globaladdr's offset and rewrite the
23605// globaladdr as (globaladdr + constant) - constant.
23607 const AArch64Subtarget *Subtarget,
23608 const TargetMachine &TM) {
23609 auto *GN = cast<GlobalAddressSDNode>(N);
23610 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
23612 return SDValue();
23613
23614 uint64_t MinOffset = -1ull;
23615 for (SDNode *N : GN->uses()) {
23616 if (N->getOpcode() != ISD::ADD)
23617 return SDValue();
23618 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
23619 if (!C)
23620 C = dyn_cast<ConstantSDNode>(N->getOperand(1));
23621 if (!C)
23622 return SDValue();
23623 MinOffset = std::min(MinOffset, C->getZExtValue());
23624 }
23625 uint64_t Offset = MinOffset + GN->getOffset();
23626
23627 // Require that the new offset is larger than the existing one. Otherwise, we
23628 // can end up oscillating between two possible DAGs, for example,
23629 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
23630 if (Offset <= uint64_t(GN->getOffset()))
23631 return SDValue();
23632
23633 // Check whether folding this offset is legal. It must not go out of bounds of
23634 // the referenced object to avoid violating the code model, and must be
23635 // smaller than 2^20 because this is the largest offset expressible in all
23636 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
23637 // stores an immediate signed 21 bit offset.)
23638 //
23639 // This check also prevents us from folding negative offsets, which will end
23640 // up being treated in the same way as large positive ones. They could also
23641 // cause code model violations, and aren't really common enough to matter.
23642 if (Offset >= (1 << 20))
23643 return SDValue();
23644
23645 const GlobalValue *GV = GN->getGlobal();
23646 Type *T = GV->getValueType();
23647 if (!T->isSized() ||
23649 return SDValue();
23650
23651 SDLoc DL(GN);
23652 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
23653 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
23654 DAG.getConstant(MinOffset, DL, MVT::i64));
23655}
23656
23658 const AArch64Subtarget *Subtarget) {
23659 SDValue BR = N->getOperand(0);
23660 if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
23661 !BR.getValueType().isScalarInteger())
23662 return SDValue();
23663
23664 SDLoc DL(N);
23665 return DAG.getNode(ISD::CTTZ, DL, BR.getValueType(), BR.getOperand(0));
23666}
23667
23668// Turns the vector of indices into a vector of byte offsets by scaling Offset
23669// by (BitWidth / 8).
23671 SDLoc DL, unsigned BitWidth) {
23672 assert(Offset.getValueType().isScalableVector() &&
23673 "This method is only for scalable vectors of offsets");
23674
23675 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
23676 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
23677
23678 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
23679}
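
A scalar sketch of the scaling performed above (not from this file): each lane index becomes a byte offset by shifting left by log2 of the element size in bytes.

  // For power-of-two element sizes (e.g. BitWidth == 64, so 8 bytes),
  // Index << log2(BitWidth / 8) equals Index * (BitWidth / 8).
  static unsigned long long indexToByteOffset(unsigned long long Index,
                                              unsigned BitWidth) {
    unsigned Shift = 0;
    for (unsigned Bytes = BitWidth / 8; Bytes > 1; Bytes >>= 1)
      ++Shift; // Shift ends up as log2(BitWidth / 8).
    return Index << Shift;
  }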
23680
23681/// Check if the value of \p OffsetInBytes can be used as an immediate for
23682/// the gather load/prefetch and scatter store instructions with vector base and
23683/// immediate offset addressing mode:
23684///
23685/// [<Zn>.[S|D]{, #<imm>}]
23686///
23687/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
23688inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
23689 unsigned ScalarSizeInBytes) {
23690 // The immediate is not a multiple of the scalar size.
23691 if (OffsetInBytes % ScalarSizeInBytes)
23692 return false;
23693
23694 // The immediate is out of range.
23695 if (OffsetInBytes / ScalarSizeInBytes > 31)
23696 return false;
23697
23698 return true;
23699}
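
A usage sketch of the check above (not from this file; exampleVecImmAddrChecks is a hypothetical helper), for 32-bit elements where the immediate must be 4 * k with k in [0, 31]:

  static bool exampleVecImmAddrChecks() {
    return isValidImmForSVEVecImmAddrMode(124, 4) &&  // 4 * 31 -> valid
           !isValidImmForSVEVecImmAddrMode(126, 4) && // not a multiple of 4
           !isValidImmForSVEVecImmAddrMode(128, 4);   // 4 * 32 -> out of range
  }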
23700
23701/// Check if the value of \p Offset represents a valid immediate for the SVE
23702/// gather load/prefetch and scatter store instructions with vector base and
23703/// immediate offset addressing mode:
23704///
23705/// [<Zn>.[S|D]{, #<imm>}]
23706///
23707/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
23708static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
23709 unsigned ScalarSizeInBytes) {
23710 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
23711 return OffsetConst && isValidImmForSVEVecImmAddrMode(
23712 OffsetConst->getZExtValue(), ScalarSizeInBytes);
23713}
23714
23716 unsigned Opcode,
23717 bool OnlyPackedOffsets = true) {
23718 const SDValue Src = N->getOperand(2);
23719 const EVT SrcVT = Src->getValueType(0);
23720 assert(SrcVT.isScalableVector() &&
23721 "Scatter stores are only possible for SVE vectors");
23722
23723 SDLoc DL(N);
23724 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
23725
23726 // Make sure that source data will fit into an SVE register
23728 return SDValue();
23729
23730 // For FPs, ACLE only supports _packed_ single and double precision types.
23731 // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
23732 if (SrcElVT.isFloatingPoint())
23733 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
23734 ((Opcode != AArch64ISD::SST1Q_PRED &&
23735 Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
23736 ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
23737 return SDValue();
23738
23739 // Depending on the addressing mode, this is either a pointer or a vector of
23740 // pointers (that fits into one register)
23741 SDValue Base = N->getOperand(4);
23742 // Depending on the addressing mode, this is either a single offset or a
23743 // vector of offsets (that fits into one register)
23744 SDValue Offset = N->getOperand(5);
23745
23746 // For "scalar + vector of indices", just scale the indices. This only
23747 // applies to non-temporal scatters because there's no instruction that takes
23748 // indices.
23749 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
23750 Offset =
23752 Opcode = AArch64ISD::SSTNT1_PRED;
23753 } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
23754 Offset =
23756 Opcode = AArch64ISD::SST1Q_PRED;
23757 }
23758
23759 // In the case of non-temporal gather loads there's only one SVE instruction
23760 // per data-size: "scalar + vector", i.e.
23761 // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
23762 // Since we do have intrinsics that allow the arguments to be in a different
23763 // order, we may need to swap them to match the spec.
23764 if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
23765 Offset.getValueType().isVector())
23767
23768 // SST1_IMM requires that the offset is an immediate that is:
23769 // * a multiple of #SizeInBytes,
23770 // * in the range [0, 31 x #SizeInBytes],
23771 // where #SizeInBytes is the size in bytes of the stored items. For
23772 // immediates outside that range and non-immediate scalar offsets use SST1 or
23773 // SST1_UXTW instead.
23774 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
23776 SrcVT.getScalarSizeInBits() / 8)) {
23777 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
23779 else
23780 Opcode = AArch64ISD::SST1_PRED;
23781
23783 }
23784 }
23785
23786 auto &TLI = DAG.getTargetLoweringInfo();
23787 if (!TLI.isTypeLegal(Base.getValueType()))
23788 return SDValue();
23789
23790 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
23791 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
23792 // nxv2i64. Legalize accordingly.
23793 if (!OnlyPackedOffsets &&
23794 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
23795 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
23796
23797 if (!TLI.isTypeLegal(Offset.getValueType()))
23798 return SDValue();
23799
23800 // Source value type that is representable in hardware
23801 EVT HwSrcVt = getSVEContainerType(SrcVT);
23802
23803 // Keep the original type of the input data to store - this is needed to be
23804 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
23805 // FP values we want the integer equivalent, so just use HwSrcVt.
23806 SDValue InputVT = DAG.getValueType(SrcVT);
23807 if (SrcVT.isFloatingPoint())
23808 InputVT = DAG.getValueType(HwSrcVt);
23809
23810 SDVTList VTs = DAG.getVTList(MVT::Other);
23811 SDValue SrcNew;
23812
23813 if (Src.getValueType().isFloatingPoint())
23814 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
23815 else
23816 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
23817
23818 SDValue Ops[] = {N->getOperand(0), // Chain
23819 SrcNew,
23820 N->getOperand(3), // Pg
23821 Base,
23822 Offset,
23823 InputVT};
23824
23825 return DAG.getNode(Opcode, DL, VTs, Ops);
23826}
23827
23829 unsigned Opcode,
23830 bool OnlyPackedOffsets = true) {
23831 const EVT RetVT = N->getValueType(0);
23832 assert(RetVT.isScalableVector() &&
23833 "Gather loads are only possible for SVE vectors");
23834
23835 SDLoc DL(N);
23836
23837 // Make sure that the loaded data will fit into an SVE register
23839 return SDValue();
23840
23841 // Depending on the addressing mode, this is either a pointer or a vector of
23842 // pointers (that fits into one register)
23843 SDValue Base = N->getOperand(3);
23844 // Depending on the addressing mode, this is either a single offset or a
23845 // vector of offsets (that fits into one register)
23846 SDValue Offset = N->getOperand(4);
23847
23848 // For "scalar + vector of indices", scale the indices to obtain unscaled
23849 // offsets. This applies to non-temporal and quadword gathers, which do not
23850 // have an addressing mode with scaled offset.
23853 RetVT.getScalarSizeInBits());
23855 } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
23857 RetVT.getScalarSizeInBits());
23859 }
23860
23861 // In the case of non-temporal gather loads and quadword gather loads there's
23862 // only one addressing mode : "vector + scalar", e.g.
23863 // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
23864 // Since we do have intrinsics that allow the arguments to be in a different
23865 // order, we may need to swap them to match the spec.
23866 if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
23867 Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
23868 Offset.getValueType().isVector())
23870
23871 // GLD{FF}1_IMM requires that the offset is an immediate that is:
23872 // * a multiple of #SizeInBytes,
23873 // * in the range [0, 31 x #SizeInBytes],
23874 // where #SizeInBytes is the size in bytes of the loaded items. For
23875 // immediates outside that range and non-immediate scalar offsets use
23876 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
23877 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
23880 RetVT.getScalarSizeInBits() / 8)) {
23881 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
23882 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
23885 else
23886 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
23889
23891 }
23892 }
23893
23894 auto &TLI = DAG.getTargetLoweringInfo();
23895 if (!TLI.isTypeLegal(Base.getValueType()))
23896 return SDValue();
23897
23898 // Some gather load variants allow unpacked offsets, but only as nxv2i32
23899 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
23900 // nxv2i64. Legalize accordingly.
23901 if (!OnlyPackedOffsets &&
23902 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
23903 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
23904
23905 // Return value type that is representable in hardware
23906 EVT HwRetVt = getSVEContainerType(RetVT);
23907
23908 // Keep the original output value type around - this is needed to be able to
23909 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
23910 // values we want the integer equivalent, so just use HwRetVT.
23911 SDValue OutVT = DAG.getValueType(RetVT);
23912 if (RetVT.isFloatingPoint())
23913 OutVT = DAG.getValueType(HwRetVt);
23914
23915 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
23916 SDValue Ops[] = {N->getOperand(0), // Chain
23917 N->getOperand(2), // Pg
23918 Base, Offset, OutVT};
23919
23920 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
23921 SDValue LoadChain = SDValue(Load.getNode(), 1);
23922
23923 if (RetVT.isInteger() && (RetVT != HwRetVt))
23924 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
23925
23926 // If the original return value was FP, bitcast accordingly. Doing it here
23927 // means that we can avoid adding TableGen patterns for FPs.
23928 if (RetVT.isFloatingPoint())
23929 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
23930
23931 return DAG.getMergeValues({Load, LoadChain}, DL);
23932}
23933
23934static SDValue
23936 SelectionDAG &DAG) {
23937 SDLoc DL(N);
23938 SDValue Src = N->getOperand(0);
23939 unsigned Opc = Src->getOpcode();
23940
23941 // Sign extend of an unsigned unpack -> signed unpack
23942 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
23943
23944 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
23946
23947 // Push the sign extend to the operand of the unpack
23948 // This is necessary where, for example, the operand of the unpack
23949 // is another unpack:
23950 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
23951 // ->
23952 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
23953 // ->
23954 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
23955 SDValue ExtOp = Src->getOperand(0);
23956 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
23957 EVT EltTy = VT.getVectorElementType();
23958 (void)EltTy;
23959
23960 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
23961 "Sign extending from an invalid type");
23962
23963 EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
23964
23966 ExtOp, DAG.getValueType(ExtVT));
23967
23968 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
23969 }
23970
23971 if (DCI.isBeforeLegalizeOps())
23972 return SDValue();
23973
23975 return SDValue();
23976
23977 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
23978 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
23979 unsigned NewOpc;
23980 unsigned MemVTOpNum = 4;
23981 switch (Opc) {
23984 MemVTOpNum = 3;
23985 break;
23988 MemVTOpNum = 3;
23989 break;
23992 MemVTOpNum = 3;
23993 break;
23996 break;
23999 break;
24002 break;
24005 break;
24008 break;
24011 break;
24014 break;
24017 break;
24020 break;
24023 break;
24026 break;
24029 break;
24032 break;
24035 break;
24038 break;
24039 default:
24040 return SDValue();
24041 }
24042
24043 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
24044 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
24045
24046 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
24047 return SDValue();
24048
24049 EVT DstVT = N->getValueType(0);
24050 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
24051
24053 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
24054 Ops.push_back(Src->getOperand(I));
24055
24056 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
24057 DCI.CombineTo(N, ExtLoad);
24058 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
24059
24060 // Return N so it doesn't get rechecked
24061 return SDValue(N, 0);
24062}
24063
24064/// Legalize the gather prefetch (scalar + vector addressing mode) when the
24065/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
24066/// != nxv2i32) do not need legalization.
24068 const unsigned OffsetPos = 4;
24069 SDValue Offset = N->getOperand(OffsetPos);
24070
24071 // Not an unpacked vector, bail out.
24072 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
24073 return SDValue();
24074
24075 // Extend the unpacked offset vector to 64-bit lanes.
24076 SDLoc DL(N);
24077 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
24078 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
24079 // Replace the offset operand with the 64-bit one.
24080 Ops[OffsetPos] = Offset;
24081
24082 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
24083}
24084
24085/// Combines a node carrying the intrinsic
24086/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
24087/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
24088/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
24089/// sve gather prefetch instruction with vector plus immediate addressing mode.
24091 unsigned ScalarSizeInBytes) {
24092 const unsigned ImmPos = 4, OffsetPos = 3;
24093 // No need to combine the node if the immediate is valid...
24094 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
24095 return SDValue();
24096
24097 // ...otherwise swap the offset base with the offset...
24098 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
24099 std::swap(Ops[ImmPos], Ops[OffsetPos]);
24100 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
24101 // `aarch64_sve_prfb_gather_uxtw_index`.
24102 SDLoc DL(N);
24103 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
24104 MVT::i64);
24105
24106 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
24107}
24108
24109// Return true if the vector operation can guarantee only the first lane of its
24110// result contains data, with all bits in other lanes set to zero.
24111static bool isLanes1toNKnownZero(SDValue Op) {
24112 switch (Op.getOpcode()) {
24113 default:
24114 return false;
24130 return true;
24131 }
24132}
24133
24135 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
24136 SDValue InsertVec = N->getOperand(0);
24137 SDValue InsertElt = N->getOperand(1);
24138 SDValue InsertIdx = N->getOperand(2);
24139
24140 // We only care about inserts into the first element...
24141 if (!isNullConstant(InsertIdx))
24142 return SDValue();
24143 // ...of a zero'd vector...
24145 return SDValue();
24146 // ...where the inserted data was previously extracted...
24147 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
24148 return SDValue();
24149
24150 SDValue ExtractVec = InsertElt.getOperand(0);
24151 SDValue ExtractIdx = InsertElt.getOperand(1);
24152
24153 // ...from the first element of a vector.
24154 if (!isNullConstant(ExtractIdx))
24155 return SDValue();
24156
24157 // If we get here we are effectively trying to zero lanes 1-N of a vector.
24158
24159 // Ensure there's no type conversion going on.
24160 if (N->getValueType(0) != ExtractVec.getValueType())
24161 return SDValue();
24162
24163 if (!isLanes1toNKnownZero(ExtractVec))
24164 return SDValue();
24165
24166 // The explicit zeroing is redundant.
24167 return ExtractVec;
24168}
24169
24170static SDValue
24173 return Res;
24174
24175 return performPostLD1Combine(N, DCI, true);
24176}
24177
24179 EVT Ty = N->getValueType(0);
24180 if (Ty.isInteger())
24181 return SDValue();
24182
24185 if (ExtIntTy.getVectorElementType().getScalarSizeInBits() <
24187 return SDValue();
24188
24189 SDLoc DL(N);
24190 SDValue LHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(0)),
24191 DL, ExtIntTy);
24192 SDValue RHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(1)),
24193 DL, ExtIntTy);
24194 SDValue Idx = N->getOperand(2);
24195 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ExtIntTy, LHS, RHS, Idx);
24196 SDValue Trunc = DAG.getAnyExtOrTrunc(Splice, DL, IntTy);
24197 return DAG.getBitcast(Ty, Trunc);
24198}
24199
24202 const AArch64Subtarget *Subtarget) {
24203 SDValue N0 = N->getOperand(0);
24204 EVT VT = N->getValueType(0);
24205
24206 // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
24207 if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND)
24208 return SDValue();
24209
24210 auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
24211 EVT EltVT = VT.getVectorElementType();
24212 return EltVT == MVT::f32 || EltVT == MVT::f64;
24213 };
24214
24215 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
24216 // We purposefully don't care about legality of the nodes here as we know
24217 // they can be split down into something legal.
24218 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
24219 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
24220 VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
24221 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
24222 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
24223 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
24224 LN0->getChain(), LN0->getBasePtr(),
24225 N0.getValueType(), LN0->getMemOperand());
24226 DCI.CombineTo(N, ExtLoad);
24227 DCI.CombineTo(
24228 N0.getNode(),
24229 DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
24230 DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
24231 ExtLoad.getValue(1));
24232 return SDValue(N, 0); // Return N so it doesn't get rechecked!
24233 }
24234
24235 return SDValue();
24236}
24237
24239 const AArch64Subtarget *Subtarget) {
24240 EVT VT = N->getValueType(0);
24241
24242 // Don't expand for NEON, SVE2 or SME
24243 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
24244 return SDValue();
24245
24246 SDLoc DL(N);
24247
24248 SDValue Mask = N->getOperand(0);
24249 SDValue In1 = N->getOperand(1);
24250 SDValue In2 = N->getOperand(2);
24251
24252 SDValue InvMask = DAG.getNOT(DL, Mask, VT);
24253 SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
24254 SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
24255 return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
24256}
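
A scalar sketch of the expansion above (not from this file), where each lane's mask is either all ones or all zeros:

  // With an all-ones-or-all-zeros mask, (Mask & In1) | (~Mask & In2) picks In1
  // where the mask is set and In2 elsewhere, i.e. it implements the select.
  static bool bitwiseSelectHolds(bool PickFirst, unsigned In1, unsigned In2) {
    unsigned Mask = PickFirst ? ~0u : 0u;
    unsigned Expanded = (Mask & In1) | (~Mask & In2);
    return Expanded == (PickFirst ? In1 : In2);
  }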
24257
24259 EVT VT = N->getValueType(0);
24260
24261 SDValue Insert = N->getOperand(0);
24262 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
24263 return SDValue();
24264
24265 if (!Insert.getOperand(0).isUndef())
24266 return SDValue();
24267
24268 uint64_t IdxInsert = Insert.getConstantOperandVal(2);
24269 uint64_t IdxDupLane = N->getConstantOperandVal(1);
24270 if (IdxInsert != 0 || IdxDupLane != 0)
24271 return SDValue();
24272
24273 SDValue Bitcast = Insert.getOperand(1);
24274 if (Bitcast.getOpcode() != ISD::BITCAST)
24275 return SDValue();
24276
24277 SDValue Subvec = Bitcast.getOperand(0);
24278 EVT SubvecVT = Subvec.getValueType();
24279 if (!SubvecVT.is128BitVector())
24280 return SDValue();
24281 EVT NewSubvecVT =
24283
24284 SDLoc DL(N);
24285 SDValue NewInsert =
24286 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
24287 DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
24288 SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
24289 NewInsert, N->getOperand(1));
24290 return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
24291}
24292
24293// Try to combine mull with uzp1.
24294static SDValue tryCombineMULLWithUZP1(SDNode *N,
24295 TargetLowering::DAGCombinerInfo &DCI,
24296 SelectionDAG &DAG) {
24297 if (DCI.isBeforeLegalizeOps())
24298 return SDValue();
24299
24300 SDValue LHS = N->getOperand(0);
24301 SDValue RHS = N->getOperand(1);
24302
24303 SDValue ExtractHigh;
24304 SDValue ExtractLow;
24305 SDValue TruncHigh;
24306 SDValue TruncLow;
24307 SDLoc DL(N);
24308
24309 // Check the operands are trunc and extract_high.
24311 RHS.getOpcode() == ISD::TRUNCATE) {
24312 TruncHigh = RHS;
24313 if (LHS.getOpcode() == ISD::BITCAST)
24314 ExtractHigh = LHS.getOperand(0);
24315 else
24316 ExtractHigh = LHS;
24318 LHS.getOpcode() == ISD::TRUNCATE) {
24319 TruncHigh = LHS;
24320 if (LHS.getOpcode() == ISD::BITCAST)
24321 ExtractHigh = RHS.getOperand(0);
24322 else
24323 ExtractHigh = RHS;
24324 } else
24325 return SDValue();
24326
24327 // If the truncate's operand is a BUILD_VECTOR with DUP, do not combine the op
24328 // with uzp1.
24329 // You can see the regressions in test/CodeGen/AArch64/aarch64-smull.ll.
24330 SDValue TruncHighOp = TruncHigh.getOperand(0);
24331 EVT TruncHighOpVT = TruncHighOp.getValueType();
24332 if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
24333 DAG.isSplatValue(TruncHighOp, false))
24334 return SDValue();
24335
24336 // Check there is other extract_high with same source vector.
24337 // For example,
24338 //
24339 // t18: v4i16 = extract_subvector t2, Constant:i64<0>
24340 // t12: v4i16 = truncate t11
24341 // t31: v4i32 = AArch64ISD::SMULL t18, t12
24342 // t23: v4i16 = extract_subvector t2, Constant:i64<4>
24343 // t16: v4i16 = truncate t15
24344 // t30: v4i32 = AArch64ISD::SMULL t23, t1
24345 //
24346 // This dagcombine assumes the two extract_high nodes use the same source
24347 // vector in order to detect the pair of mulls. If they use different source
24348 // vectors, this code will not work.
24349 bool HasFoundMULLow = true;
24350 SDValue ExtractHighSrcVec = ExtractHigh.getOperand(0);
24351 if (ExtractHighSrcVec->use_size() != 2)
24352 HasFoundMULLow = false;
24353
24354 // Find ExtractLow.
24355 for (SDNode *User : ExtractHighSrcVec.getNode()->uses()) {
24356 if (User == ExtractHigh.getNode())
24357 continue;
24358
24359 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
24361 HasFoundMULLow = false;
24362 break;
24363 }
24364
24365 ExtractLow.setNode(User);
24366 }
24367
24368 if (!ExtractLow || !ExtractLow->hasOneUse())
24369 HasFoundMULLow = false;
24370
24371 // Check ExtractLow's user.
24372 if (HasFoundMULLow) {
24373 SDNode *ExtractLowUser = *ExtractLow.getNode()->use_begin();
24374 if (ExtractLowUser->getOpcode() != N->getOpcode()) {
24375 HasFoundMULLow = false;
24376 } else {
24377 if (ExtractLowUser->getOperand(0) == ExtractLow) {
24378 if (ExtractLowUser->getOperand(1).getOpcode() == ISD::TRUNCATE)
24379 TruncLow = ExtractLowUser->getOperand(1);
24380 else
24381 HasFoundMULLow = false;
24382 } else {
24383 if (ExtractLowUser->getOperand(0).getOpcode() == ISD::TRUNCATE)
24384 TruncLow = ExtractLowUser->getOperand(0);
24385 else
24386 HasFoundMULLow = false;
24387 }
24388 }
24389 }
24390
24391 // If the truncate's operand is a BUILD_VECTOR with DUP, do not combine the op
24392 // with uzp1.
24393 // You can see the regressions in test/CodeGen/AArch64/aarch64-smull.ll.
24394 EVT TruncHighVT = TruncHigh.getValueType();
24395 EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(*DAG.getContext());
24396 SDValue TruncLowOp =
24397 HasFoundMULLow ? TruncLow.getOperand(0) : DAG.getUNDEF(UZP1VT);
24398 EVT TruncLowOpVT = TruncLowOp.getValueType();
24399 if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
24400 DAG.isSplatValue(TruncLowOp, false)))
24401 return SDValue();
24402
24403 // Create uzp1, extract_high and extract_low.
24404 if (TruncHighOpVT != UZP1VT)
24405 TruncHighOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncHighOp);
24406 if (TruncLowOpVT != UZP1VT)
24407 TruncLowOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncLowOp);
24408
24409 SDValue UZP1 =
24410 DAG.getNode(AArch64ISD::UZP1, DL, UZP1VT, TruncLowOp, TruncHighOp);
24411 SDValue HighIdxCst =
24412 DAG.getConstant(TruncHighVT.getVectorNumElements(), DL, MVT::i64);
24413 SDValue NewTruncHigh =
24414 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncHighVT, UZP1, HighIdxCst);
24415 DAG.ReplaceAllUsesWith(TruncHigh, NewTruncHigh);
24416
24417 if (HasFoundMULLow) {
24418 EVT TruncLowVT = TruncLow.getValueType();
24419 SDValue NewTruncLow = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncLowVT,
24420 UZP1, ExtractLow.getOperand(1));
24421 DAG.ReplaceAllUsesWith(TruncLow, NewTruncLow);
24422 }
24423
24424 return SDValue(N, 0);
24425}
24426
24429 SelectionDAG &DAG) {
24430 if (SDValue Val =
24432 return Val;
24433
24434 if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
24435 return Val;
24436
24437 return SDValue();
24438}
24439
24440static SDValue
24441performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
24442 SelectionDAG &DAG) {
24443 // Let's do below transform.
24444 //
24445 // t34: v4i32 = AArch64ISD::UADDLV t2
24446 // t35: i32 = extract_vector_elt t34, Constant:i64<0>
24447 // t7: i64 = zero_extend t35
24448 // t20: v1i64 = scalar_to_vector t7
24449 // ==>
24450 // t34: v4i32 = AArch64ISD::UADDLV t2
24451 // t39: v2i32 = extract_subvector t34, Constant:i64<0>
24452 // t40: v1i64 = AArch64ISD::NVCAST t39
24453 if (DCI.isBeforeLegalizeOps())
24454 return SDValue();
24455
24456 EVT VT = N->getValueType(0);
24457 if (VT != MVT::v1i64)
24458 return SDValue();
24459
24460 SDValue ZEXT = N->getOperand(0);
24461 if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
24462 return SDValue();
24463
24464 SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(0);
24465 if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
24466 EXTRACT_VEC_ELT.getValueType() != MVT::i32)
24467 return SDValue();
24468
24469 if (!isNullConstant(EXTRACT_VEC_ELT.getOperand(1)))
24470 return SDValue();
24471
24472 SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(0);
24473 if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
24474 UADDLV.getValueType() != MVT::v4i32 ||
24475 UADDLV.getOperand(0).getValueType() != MVT::v8i8)
24476 return SDValue();
24477
24478 // Let's generate new sequence with AArch64ISD::NVCAST.
24479 SDLoc DL(N);
24480 SDValue EXTRACT_SUBVEC =
24481 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
24482 DAG.getConstant(0, DL, MVT::i64));
24483 SDValue NVCAST =
24484 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, EXTRACT_SUBVEC);
24485
24486 return NVCAST;
24487}
24488
24489SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
24490 DAGCombinerInfo &DCI) const {
24491 SelectionDAG &DAG = DCI.DAG;
24492 switch (N->getOpcode()) {
24493 default:
24494 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
24495 break;
24496 case ISD::VECREDUCE_AND:
24497 case ISD::VECREDUCE_OR:
24498 case ISD::VECREDUCE_XOR:
24499 return performVecReduceBitwiseCombine(N, DCI, DAG);
24500 case ISD::ADD:
24501 case ISD::SUB:
24502 return performAddSubCombine(N, DCI);
24503 case ISD::BUILD_VECTOR:
24504 return performBuildVectorCombine(N, DCI, DAG);
24505 case ISD::TRUNCATE:
24506 return performTruncateCombine(N, DAG);
24507 case AArch64ISD::ANDS:
24508 return performFlagSettingCombine(N, DCI, ISD::AND);
24509 case AArch64ISD::ADC:
24510 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
24511 return R;
24512 return foldADCToCINC(N, DAG);
24513 case AArch64ISD::SBC:
24514 return foldOverflowCheck(N, DAG, /* IsAdd */ false);
24515 case AArch64ISD::ADCS:
24516 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
24517 return R;
24518 return performFlagSettingCombine(N, DCI, AArch64ISD::ADC);
24519 case AArch64ISD::SBCS:
24520 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
24521 return R;
24522 return performFlagSettingCombine(N, DCI, AArch64ISD::SBC);
24523 case ISD::XOR:
24524 return performXorCombine(N, DAG, DCI, Subtarget);
24525 case ISD::MUL:
24526 return performMulCombine(N, DAG, DCI, Subtarget);
24527 case ISD::SINT_TO_FP:
24528 case ISD::UINT_TO_FP:
24529 return performIntToFpCombine(N, DAG, Subtarget);
24530 case ISD::FP_TO_SINT:
24531 case ISD::FP_TO_UINT:
24534 return performFpToIntCombine(N, DAG, DCI, Subtarget);
24535 case ISD::FDIV:
24536 return performFDivCombine(N, DAG, DCI, Subtarget);
24537 case ISD::OR:
24538 return performORCombine(N, DCI, Subtarget, *this);
24539 case ISD::AND:
24540 return performANDCombine(N, DCI);
24541 case ISD::FADD:
24542 return performFADDCombine(N, DCI);
24543 case ISD::INTRINSIC_WO_CHAIN:
24544 return performIntrinsicCombine(N, DCI, Subtarget);
24545 case ISD::ANY_EXTEND:
24546 case ISD::ZERO_EXTEND:
24547 case ISD::SIGN_EXTEND:
24548 return performExtendCombine(N, DCI, DAG);
24549 case ISD::SIGN_EXTEND_INREG:
24550 return performSignExtendInRegCombine(N, DCI, DAG);
24551 case ISD::CONCAT_VECTORS:
24552 return performConcatVectorsCombine(N, DCI, DAG);
24553 case ISD::EXTRACT_SUBVECTOR:
24554 return performExtractSubvectorCombine(N, DCI, DAG);
24555 case ISD::INSERT_SUBVECTOR:
24556 return performInsertSubvectorCombine(N, DCI, DAG);
24557 case ISD::SELECT:
24558 return performSelectCombine(N, DCI);
24559 case ISD::VSELECT:
24560 return performVSelectCombine(N, DCI.DAG);
24561 case ISD::SETCC:
24562 return performSETCCCombine(N, DCI, DAG);
24563 case ISD::LOAD:
24564 return performLOADCombine(N, DCI, DAG, Subtarget);
24565 case ISD::STORE:
24566 return performSTORECombine(N, DCI, DAG, Subtarget);
24567 case ISD::MSTORE:
24568 return performMSTORECombine(N, DCI, DAG, Subtarget);
24569 case ISD::MGATHER:
24570 case ISD::MSCATTER:
24571 return performMaskedGatherScatterCombine(N, DCI, DAG);
24572 case ISD::VECTOR_SPLICE:
24573 return performSVESpliceCombine(N, DAG);
24574 case ISD::FP_EXTEND:
24575 return performFPExtendCombine(N, DAG, DCI, Subtarget);
24576 case AArch64ISD::BRCOND:
24577 return performBRCONDCombine(N, DCI, DAG);
24578 case AArch64ISD::TBNZ:
24579 case AArch64ISD::TBZ:
24580 return performTBZCombine(N, DCI, DAG);
24581 case AArch64ISD::CSEL:
24582 return performCSELCombine(N, DCI, DAG);
24583 case AArch64ISD::DUP:
24588 return performDUPCombine(N, DCI);
24590 return performDupLane128Combine(N, DAG);
24591 case AArch64ISD::NVCAST:
24592 return performNVCASTCombine(N, DAG);
24593 case AArch64ISD::SPLICE:
24594 return performSpliceCombine(N, DAG);
24597 return performUnpackCombine(N, DAG, Subtarget);
24598 case AArch64ISD::UZP1:
24599 return performUzpCombine(N, DAG, Subtarget);
24600 case AArch64ISD::SETCC_MERGE_ZERO:
24601 return performSetccMergeZeroCombine(N, DCI);
24618 return performGLD1Combine(N, DAG);
24619 case AArch64ISD::VASHR:
24620 case AArch64ISD::VLSHR:
24621 return performVectorShiftCombine(N, *this, DCI);
24622 case AArch64ISD::SUNPKLO:
24623 return performSunpkloCombine(N, DAG);
24624 case AArch64ISD::BSP:
24625 return performBSPExpandForSVE(N, DAG, Subtarget);
24626 case ISD::INSERT_VECTOR_ELT:
24627 return performInsertVectorEltCombine(N, DCI);
24628 case ISD::EXTRACT_VECTOR_ELT:
24629 return performExtractVectorEltCombine(N, DCI, Subtarget);
24630 case ISD::VECREDUCE_ADD:
24631 return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
24632 case AArch64ISD::UADDV:
24633 return performUADDVCombine(N, DAG);
24634 case AArch64ISD::SMULL:
24635 case AArch64ISD::UMULL:
24636 case AArch64ISD::PMULL:
24637 return performMULLCombine(N, DCI, DAG);
24638 case ISD::INTRINSIC_VOID:
24639 case ISD::INTRINSIC_W_CHAIN:
24640 switch (N->getConstantOperandVal(1)) {
24641 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
24642 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
24643 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
24644 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
24645 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
24646 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
24647 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
24648 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
24649 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
24650 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
24651 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
24652 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
24653 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
24654 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
24655 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
24656 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
24658 case Intrinsic::aarch64_neon_ld2:
24659 case Intrinsic::aarch64_neon_ld3:
24660 case Intrinsic::aarch64_neon_ld4:
24661 case Intrinsic::aarch64_neon_ld1x2:
24662 case Intrinsic::aarch64_neon_ld1x3:
24663 case Intrinsic::aarch64_neon_ld1x4:
24664 case Intrinsic::aarch64_neon_ld2lane:
24665 case Intrinsic::aarch64_neon_ld3lane:
24666 case Intrinsic::aarch64_neon_ld4lane:
24667 case Intrinsic::aarch64_neon_ld2r:
24668 case Intrinsic::aarch64_neon_ld3r:
24669 case Intrinsic::aarch64_neon_ld4r:
24670 case Intrinsic::aarch64_neon_st2:
24671 case Intrinsic::aarch64_neon_st3:
24672 case Intrinsic::aarch64_neon_st4:
24673 case Intrinsic::aarch64_neon_st1x2:
24674 case Intrinsic::aarch64_neon_st1x3:
24675 case Intrinsic::aarch64_neon_st1x4:
24676 case Intrinsic::aarch64_neon_st2lane:
24677 case Intrinsic::aarch64_neon_st3lane:
24678 case Intrinsic::aarch64_neon_st4lane:
24679 return performNEONPostLDSTCombine(N, DCI, DAG);
24680 case Intrinsic::aarch64_sve_ldnt1:
24681 return performLDNT1Combine(N, DAG);
24682 case Intrinsic::aarch64_sve_ld1rq:
24683 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
24684 case Intrinsic::aarch64_sve_ld1ro:
24685 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
24686 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
24688 case Intrinsic::aarch64_sve_ldnt1_gather:
24690 case Intrinsic::aarch64_sve_ldnt1_gather_index:
24691 return performGatherLoadCombine(N, DAG,
24693 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
24695 case Intrinsic::aarch64_sve_ld1:
24697 case Intrinsic::aarch64_sve_ldnf1:
24699 case Intrinsic::aarch64_sve_ldff1:
24701 case Intrinsic::aarch64_sve_st1:
24702 return performST1Combine(N, DAG);
24703 case Intrinsic::aarch64_sve_stnt1:
24704 return performSTNT1Combine(N, DAG);
24705 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
24707 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
24709 case Intrinsic::aarch64_sve_stnt1_scatter:
24711 case Intrinsic::aarch64_sve_stnt1_scatter_index:
24713 case Intrinsic::aarch64_sve_ld1_gather:
24715 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
24716 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
24718 case Intrinsic::aarch64_sve_ld1q_gather_index:
24719 return performGatherLoadCombine(N, DAG,
24721 case Intrinsic::aarch64_sve_ld1_gather_index:
24722 return performGatherLoadCombine(N, DAG,
24724 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
24726 /*OnlyPackedOffsets=*/false);
24727 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
24729 /*OnlyPackedOffsets=*/false);
24730 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
24731 return performGatherLoadCombine(N, DAG,
24733 /*OnlyPackedOffsets=*/false);
24734 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
24735 return performGatherLoadCombine(N, DAG,
24737 /*OnlyPackedOffsets=*/false);
24738 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
24740 case Intrinsic::aarch64_sve_ldff1_gather:
24742 case Intrinsic::aarch64_sve_ldff1_gather_index:
24743 return performGatherLoadCombine(N, DAG,
24745 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
24746 return performGatherLoadCombine(N, DAG,
24748 /*OnlyPackedOffsets=*/false);
24749 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
24750 return performGatherLoadCombine(N, DAG,
24752 /*OnlyPackedOffsets=*/false);
24753 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
24754 return performGatherLoadCombine(N, DAG,
24756 /*OnlyPackedOffsets=*/false);
24757 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
24758 return performGatherLoadCombine(N, DAG,
24760 /*OnlyPackedOffsets=*/false);
24761 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
24762 return performGatherLoadCombine(N, DAG,
24764 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
24765 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
24767 case Intrinsic::aarch64_sve_st1q_scatter_index:
24769 case Intrinsic::aarch64_sve_st1_scatter:
24771 case Intrinsic::aarch64_sve_st1_scatter_index:
24773 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
24775 /*OnlyPackedOffsets=*/false);
24776 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
24778 /*OnlyPackedOffsets=*/false);
24779 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
24780 return performScatterStoreCombine(N, DAG,
24782 /*OnlyPackedOffsets=*/false);
24783 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
24784 return performScatterStoreCombine(N, DAG,
24786 /*OnlyPackedOffsets=*/false);
24787 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
24789 case Intrinsic::aarch64_rndr:
24790 case Intrinsic::aarch64_rndrrs: {
24791 unsigned IntrinsicID = N->getConstantOperandVal(1);
24792 auto Register =
24793 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
24794 : AArch64SysReg::RNDRRS);
24795 SDLoc DL(N);
24796 SDValue A = DAG.getNode(
24797 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other),
24798 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i64));
24799 SDValue B = DAG.getNode(
24800 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
24801 DAG.getConstant(0, DL, MVT::i32),
24802 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
24803 return DAG.getMergeValues(
24804 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
24805 }
24806 case Intrinsic::aarch64_sme_ldr_zt:
24807 return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N),
24808 DAG.getVTList(MVT::Other), N->getOperand(0),
24809 N->getOperand(2), N->getOperand(3));
24810 case Intrinsic::aarch64_sme_str_zt:
24811 return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N),
24812 DAG.getVTList(MVT::Other), N->getOperand(0),
24813 N->getOperand(2), N->getOperand(3));
24814 default:
24815 break;
24816 }
24817 break;
24818 case ISD::GlobalAddress:
24819 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
24820 case ISD::CTLZ:
24821 return performCTLZCombine(N, DAG, Subtarget);
24822 case ISD::SCALAR_TO_VECTOR:
24823 return performScalarToVectorCombine(N, DCI, DAG);
24824 }
24825 return SDValue();
24826}
24827
24828// Check if the return value is used as only a return value, as otherwise
24829// we can't perform a tail-call. In particular, we need to check for
24830// target ISD nodes that are returns and any other "odd" constructs
24831// that the generic analysis code won't necessarily catch.
24832bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
24833 SDValue &Chain) const {
24834 if (N->getNumValues() != 1)
24835 return false;
24836 if (!N->hasNUsesOfValue(1, 0))
24837 return false;
24838
24839 SDValue TCChain = Chain;
24840 SDNode *Copy = *N->use_begin();
24841 if (Copy->getOpcode() == ISD::CopyToReg) {
24842 // If the copy has a glue operand, we conservatively assume it isn't safe to
24843 // perform a tail call.
24844 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
24845 MVT::Glue)
24846 return false;
24847 TCChain = Copy->getOperand(0);
24848 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
24849 return false;
24850
24851 bool HasRet = false;
24852 for (SDNode *Node : Copy->uses()) {
24853 if (Node->getOpcode() != AArch64ISD::RET_GLUE)
24854 return false;
24855 HasRet = true;
24856 }
24857
24858 if (!HasRet)
24859 return false;
24860
24861 Chain = TCChain;
24862 return true;
24863}
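// Illustrative sketch (not from the LLVM sources): a call whose result feeds
// straight into the return, with no other uses, is the shape this hook
// accepts, e.g. in IR:
//
//   define i32 @caller(i32 %x) {
//     %r = tail call i32 @callee(i32 %x)
//     ret i32 %r
//   }
//
// If %r had any additional use, or the CopyToReg carried glue, the function
// above would report false and the tail call would not be formed.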
24864
24865// Return whether an instruction can potentially be optimized to a tail
24866// call. This will cause the optimizers to attempt to move, or duplicate,
24867// return instructions to help enable tail call optimizations for this
24868// instruction.
24869bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
24870 return CI->isTailCall();
24871}
24872
24873bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
24874 Register Offset, bool IsPre,
24875 MachineRegisterInfo &MRI) const {
24876 auto CstOffset = getIConstantVRegVal(Offset, MRI);
24877 if (!CstOffset || CstOffset->isZero())
24878 return false;
24879
24880 // All of the indexed addressing mode instructions take a signed 9 bit
24881 // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
24882 // encodes the sign/indexing direction.
24883 return isInt<9>(CstOffset->getSExtValue());
24884}
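// Rough illustration (assumed encodings, not taken from this file): the
// signed 9-bit immediate admits offsets in [-256, 255], so
//
//   ldr x0, [x1, #255]!   // pre-indexed, offset 255: representable
//   ldr x0, [x1, #-256]!  // pre-indexed, offset -256: representable
//   ldr x0, [x1, #256]!   // offset 256: rejected by isInt<9>, not formed
//
// is the kind of boundary this check enforces for GlobalISel indexed accesses.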
24885
24886bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
24887 SDValue &Base,
24888 SDValue &Offset,
24889 SelectionDAG &DAG) const {
24890 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
24891 return false;
24892
24893 // Non-null if there is exactly one user of the loaded value (ignoring chain).
24894 SDNode *ValOnlyUser = nullptr;
24895 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
24896 ++UI) {
24897 if (UI.getUse().getResNo() == 1)
24898 continue; // Ignore chain.
24899 if (ValOnlyUser == nullptr)
24900 ValOnlyUser = *UI;
24901 else {
24902 ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
24903 break;
24904 }
24905 }
24906
24907 auto IsUndefOrZero = [](SDValue V) {
24908 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
24909 };
24910
24911 // If the only user of the value is a scalable vector splat, it is
24912 // preferable to do a replicating load (ld1r*).
24913 if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
24914 (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
24915 (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
24916 IsUndefOrZero(ValOnlyUser->getOperand(2)))))
24917 return false;
24918
24919 Base = Op->getOperand(0);
24920 // All of the indexed addressing mode instructions take a signed
24921 // 9 bit immediate offset.
24922 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
24923 int64_t RHSC = RHS->getSExtValue();
24924 if (Op->getOpcode() == ISD::SUB)
24925 RHSC = -(uint64_t)RHSC;
24926 if (!isInt<9>(RHSC))
24927 return false;
24928 // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
24929 // when dealing with subtraction.
24930 Offset = DAG.getConstant(RHSC, SDLoc(N), RHS->getValueType(0));
24931 return true;
24932 }
24933 return false;
24934}
24935
24936bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
24937 SDValue &Offset,
24938 ISD::MemIndexedMode &AM,
24939 SelectionDAG &DAG) const {
24940 EVT VT;
24941 SDValue Ptr;
24942 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
24943 VT = LD->getMemoryVT();
24944 Ptr = LD->getBasePtr();
24945 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
24946 VT = ST->getMemoryVT();
24947 Ptr = ST->getBasePtr();
24948 } else
24949 return false;
24950
24951 if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, DAG))
24952 return false;
24953 AM = ISD::PRE_INC;
24954 return true;
24955}
24956
24957bool AArch64TargetLowering::getPostIndexedAddressParts(
24958 SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
24959 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
24960 EVT VT;
24961 SDValue Ptr;
24962 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
24963 VT = LD->getMemoryVT();
24964 Ptr = LD->getBasePtr();
24965 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
24966 VT = ST->getMemoryVT();
24967 Ptr = ST->getBasePtr();
24968 } else
24969 return false;
24970
24971 if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
24972 return false;
24973 // Post-indexing updates the base, so it's not a valid transform
24974 // if that's not the same as the load's pointer.
24975 if (Ptr != Base)
24976 return false;
24977 AM = ISD::POST_INC;
24978 return true;
24979}
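// A minimal sketch of the transformation these hooks enable (assumed
// assembly, for illustration only): a load followed by a pointer increment
//
//   ldr x0, [x1]
//   add x1, x1, #8
//
// can be selected as a single post-indexed access when the offset fits in
// the signed 9-bit immediate:
//
//   ldr x0, [x1], #8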
24980
24981static void replaceBoolVectorBitcast(SDNode *N,
24982 SmallVectorImpl<SDValue> &Results,
24983 SelectionDAG &DAG) {
24984 SDLoc DL(N);
24985 SDValue Op = N->getOperand(0);
24986 EVT VT = N->getValueType(0);
24987 [[maybe_unused]] EVT SrcVT = Op.getValueType();
24988 assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
24989 "Must be bool vector.");
24990
24991 // Special handling for Clang's __builtin_convertvector. For vectors with <8
24992 // elements, it adds a vector concatenation with undef(s). If we encounter
24993 // this here, we can skip the concat.
24994 if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(0).isUndef()) {
24995 bool AllUndef = true;
24996 for (unsigned I = 1; I < Op.getNumOperands(); ++I)
24997 AllUndef &= Op.getOperand(I).isUndef();
24998
24999 if (AllUndef)
25000 Op = Op.getOperand(0);
25001 }
25002
25003 SDValue VectorBits = vectorToScalarBitmask(Op.getNode(), DAG);
25004 if (VectorBits)
25005 Results.push_back(DAG.getZExtOrTrunc(VectorBits, DL, VT));
25006}
25007
25008static void CustomNonLegalBITCASTResults(SDNode *N,
25009 SmallVectorImpl<SDValue> &Results,
25010 SelectionDAG &DAG, EVT ExtendVT,
25011 EVT CastVT) {
25012 SDLoc DL(N);
25013 SDValue Op = N->getOperand(0);
25014 EVT VT = N->getValueType(0);
25015
25016 // Use SCALAR_TO_VECTOR for lane zero
25017 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtendVT, Op);
25018 SDValue CastVal = DAG.getNode(ISD::BITCAST, DL, CastVT, Vec);
25019 SDValue IdxZero = DAG.getVectorIdxConstant(0, DL);
25020 Results.push_back(
25021 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CastVal, IdxZero));
25022}
25023
25024void AArch64TargetLowering::ReplaceBITCASTResults(
25025 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
25026 SDLoc DL(N);
25027 SDValue Op = N->getOperand(0);
25028 EVT VT = N->getValueType(0);
25029 EVT SrcVT = Op.getValueType();
25030
25031 if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
25032 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
25033 return;
25034 }
25035
25036 if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
25037 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
25038 return;
25039 }
25040
25041 if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
25042 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
25043 return;
25044 }
25045
25046 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
25047 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
25048 "Expected fp->int bitcast!");
25049
25050 // Bitcasting between unpacked vector types of different element counts is
25051 // not a NOP because the live elements are laid out differently.
25052 // 01234567
25053 // e.g. nxv2i32 = XX??XX??
25054 // nxv4f16 = X?X?X?X?
25055 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
25056 return;
25057
25058 SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
25059 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
25060 return;
25061 }
25062
25063 if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
25064 !VT.isVector())
25065 return replaceBoolVectorBitcast(N, Results, DAG);
25066
25067 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
25068 return;
25069
25070 Op = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
25071 DAG.getUNDEF(MVT::i32), Op);
25072 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
25073 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
25074}
25075
25076static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
25077 SelectionDAG &DAG,
25078 const AArch64Subtarget *Subtarget) {
25079 EVT VT = N->getValueType(0);
25080 if (!VT.is256BitVector() ||
25082 !N->getFlags().hasAllowReassociation()) ||
25083 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
25084 VT.getScalarType() == MVT::bf16)
25085 return;
25086
25087 SDValue X = N->getOperand(0);
25088 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
25089 if (!Shuf) {
25090 Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
25091 X = N->getOperand(1);
25092 if (!Shuf)
25093 return;
25094 }
25095
25096 if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
25097 return;
25098
25099 // Check the mask is 1,0,3,2,5,4,...
25100 ArrayRef<int> Mask = Shuf->getMask();
25101 for (int I = 0, E = Mask.size(); I < E; I++)
25102 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
25103 return;
25104
25105 SDLoc DL(N);
25106 auto LoHi = DAG.SplitVector(X, DL);
25107 assert(LoHi.first.getValueType() == LoHi.second.getValueType());
25108 SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
25109 LoHi.first, LoHi.second);
25110
25111 // Shuffle the elements back into order.
25112 SmallVector<int> NMask;
25113 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
25114 NMask.push_back(I);
25115 NMask.push_back(I);
25116 }
25117 Results.push_back(
25118 DAG.getVectorShuffle(VT, DL,
25119 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
25120 DAG.getUNDEF(LoHi.first.getValueType())),
25121 DAG.getUNDEF(VT), NMask));
25122}
25123
25126 SelectionDAG &DAG, unsigned InterOp,
25127 unsigned AcrossOp) {
25128 EVT LoVT, HiVT;
25129 SDValue Lo, Hi;
25130 SDLoc dl(N);
25131 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
25132 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
25133 SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
25134 SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
25135 Results.push_back(SplitVal);
25136}
25137
25138void AArch64TargetLowering::ReplaceExtractSubVectorResults(
25139 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
25140 SDValue In = N->getOperand(0);
25141 EVT InVT = In.getValueType();
25142
25143 // Common code will handle these just fine.
25144 if (!InVT.isScalableVector() || !InVT.isInteger())
25145 return;
25146
25147 SDLoc DL(N);
25148 EVT VT = N->getValueType(0);
25149
25150 // The following checks bail if this is not a halving operation.
25151
25153 ElementCount ResEC = VT.getVectorElementCount();
25154 if (InVT.getVectorElementCount() != (ResEC * 2))
25155 return;
25156
25157 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
25158 if (!CIndex)
25159 return;
25160
25161 unsigned Index = CIndex->getZExtValue();
25162 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
25163 return;
25164
25165 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
25166 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
25167
25168 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
25169 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
25170}
25171
25172// Create an even/odd pair of X registers holding integer value V.
25173static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
25174 SDLoc dl(V.getNode());
25175 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i64, MVT::i64);
25176 if (DAG.getDataLayout().isBigEndian())
25177 std::swap (VLo, VHi);
25178 SDValue RegClass =
25179 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
25180 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
25181 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
25182 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
25183 return SDValue(
25184 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
25185}
25186
25187static void ReplaceCMP_SWAP_128Results(SDNode *N,
25188 SmallVectorImpl<SDValue> &Results,
25189 SelectionDAG &DAG,
25190 const AArch64Subtarget *Subtarget) {
25191 assert(N->getValueType(0) == MVT::i128 &&
25192 "AtomicCmpSwap on types less than 128 should be legal");
25193
25194 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
25195 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
25196 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
25197 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
25198 SDValue Ops[] = {
25199 createGPRPairNode(DAG, N->getOperand(2)), // Compare value
25200 createGPRPairNode(DAG, N->getOperand(3)), // Store value
25201 N->getOperand(1), // Ptr
25202 N->getOperand(0), // Chain in
25203 };
25204
25205 unsigned Opcode;
25206 switch (MemOp->getMergedOrdering()) {
25207 case AtomicOrdering::Monotonic:
25208 Opcode = AArch64::CASPX;
25209 break;
25210 case AtomicOrdering::Acquire:
25211 Opcode = AArch64::CASPAX;
25212 break;
25213 case AtomicOrdering::Release:
25214 Opcode = AArch64::CASPLX;
25215 break;
25216 case AtomicOrdering::AcquireRelease:
25217 case AtomicOrdering::SequentiallyConsistent:
25218 Opcode = AArch64::CASPALX;
25219 break;
25220 default:
25221 llvm_unreachable("Unexpected ordering!");
25222 }
25223
25224 MachineSDNode *CmpSwap = DAG.getMachineNode(
25225 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
25226 DAG.setNodeMemRefs(CmpSwap, {MemOp});
25227
25228 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
25229 if (DAG.getDataLayout().isBigEndian())
25230 std::swap(SubReg1, SubReg2);
25231 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
25232 SDValue(CmpSwap, 0));
25233 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
25234 SDValue(CmpSwap, 0));
25235 Results.push_back(
25236 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
25237 Results.push_back(SDValue(CmpSwap, 1)); // Chain out
25238 return;
25239 }
25240
25241 unsigned Opcode;
25242 switch (MemOp->getMergedOrdering()) {
25243 case AtomicOrdering::Monotonic:
25244 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
25245 break;
25246 case AtomicOrdering::Acquire:
25247 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
25248 break;
25249 case AtomicOrdering::Release:
25250 Opcode = AArch64::CMP_SWAP_128_RELEASE;
25251 break;
25252 case AtomicOrdering::AcquireRelease:
25253 case AtomicOrdering::SequentiallyConsistent:
25254 Opcode = AArch64::CMP_SWAP_128;
25255 break;
25256 default:
25257 llvm_unreachable("Unexpected ordering!");
25258 }
25259
25260 SDLoc DL(N);
25261 auto Desired = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64);
25262 auto New = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64);
25263 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
25264 New.first, New.second, N->getOperand(0)};
25265 SDNode *CmpSwap = DAG.getMachineNode(
25266 Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
25267 Ops);
25268 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
25269
25270 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
25271 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
25272 Results.push_back(SDValue(CmpSwap, 3));
25273}
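// Hedged example of what the LSE path above produces (register choices are
// hypothetical): an IR "cmpxchg ptr %p, i128 %old, i128 %new seq_cst seq_cst"
// ends up as a single CASPAL on two sequential register pairs, roughly
//
//   caspal x0, x1, x2, x3, [x4]   // x0:x1 = expected, x2:x3 = desired
//
// whereas without LSE the CMP_SWAP_128 pseudo expands to an LDAXP/STLXP loop.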
25274
25275static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
25276 AtomicOrdering Ordering) {
25277 // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
25278 // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
25279 // the type is not legal. Therefore we shouldn't expect to see a 128-bit
25280 // ATOMIC_LOAD_CLR at any point.
25281 assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
25282 "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
25283 assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
25284 assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
25285
25286 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
25287 // The operand will need to be XORed in a separate step.
25288 switch (Ordering) {
25289 case AtomicOrdering::Monotonic:
25290 return AArch64::LDCLRP;
25291 break;
25292 case AtomicOrdering::Acquire:
25293 return AArch64::LDCLRPA;
25294 break;
25295 case AtomicOrdering::Release:
25296 return AArch64::LDCLRPL;
25297 break;
25298 case AtomicOrdering::AcquireRelease:
25299 case AtomicOrdering::SequentiallyConsistent:
25300 return AArch64::LDCLRPAL;
25301 break;
25302 default:
25303 llvm_unreachable("Unexpected ordering!");
25304 }
25305 }
25306
25307 if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
25308 switch (Ordering) {
25309 case AtomicOrdering::Monotonic:
25310 return AArch64::LDSETP;
25311 break;
25312 case AtomicOrdering::Acquire:
25313 return AArch64::LDSETPA;
25314 break;
25315 case AtomicOrdering::Release:
25316 return AArch64::LDSETPL;
25317 break;
25318 case AtomicOrdering::AcquireRelease:
25319 case AtomicOrdering::SequentiallyConsistent:
25320 return AArch64::LDSETPAL;
25321 break;
25322 default:
25323 llvm_unreachable("Unexpected ordering!");
25324 }
25325 }
25326
25327 if (ISDOpcode == ISD::ATOMIC_SWAP) {
25328 switch (Ordering) {
25329 case AtomicOrdering::Monotonic:
25330 return AArch64::SWPP;
25331 break;
25332 case AtomicOrdering::Acquire:
25333 return AArch64::SWPPA;
25334 break;
25335 case AtomicOrdering::Release:
25336 return AArch64::SWPPL;
25337 break;
25338 case AtomicOrdering::AcquireRelease:
25339 case AtomicOrdering::SequentiallyConsistent:
25340 return AArch64::SWPPAL;
25341 break;
25342 default:
25343 llvm_unreachable("Unexpected ordering!");
25344 }
25345 }
25346
25347 llvm_unreachable("Unexpected ISDOpcode!");
25348}
25349
25352 SelectionDAG &DAG,
25353 const AArch64Subtarget *Subtarget) {
25354 // LSE128 has 128-bit RMW ops, but i128 is not a legal type, so lower it
25355 // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
25356 // rather than the CASP instructions, because CASP has register classes for
25357 // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
25358 // to present them as single operands. LSE128 instructions use the GPR64
25359 // register class (because the pair does not have to be sequential), like
25360 // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
25361
25362 assert(N->getValueType(0) == MVT::i128 &&
25363 "AtomicLoadXXX on types less than 128 should be legal");
25364
25365 if (!Subtarget->hasLSE128())
25366 return;
25367
25368 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
25369 const SDValue &Chain = N->getOperand(0);
25370 const SDValue &Ptr = N->getOperand(1);
25371 const SDValue &Val128 = N->getOperand(2);
25372 std::pair<SDValue, SDValue> Val2x64 =
25373 DAG.SplitScalar(Val128, SDLoc(Val128), MVT::i64, MVT::i64);
25374
25375 const unsigned ISDOpcode = N->getOpcode();
25376 const unsigned MachineOpcode =
25377 getAtomicLoad128Opcode(ISDOpcode, MemOp->getMergedOrdering());
25378
25379 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
25380 SDLoc dl(Val128);
25381 Val2x64.first =
25382 DAG.getNode(ISD::XOR, dl, MVT::i64,
25383 DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.first);
25384 Val2x64.second =
25385 DAG.getNode(ISD::XOR, dl, MVT::i64,
25386 DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.second);
25387 }
25388
25389 SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
25390 if (DAG.getDataLayout().isBigEndian())
25391 std::swap(Ops[0], Ops[1]);
25392
25393 MachineSDNode *AtomicInst =
25394 DAG.getMachineNode(MachineOpcode, SDLoc(N),
25395 DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops);
25396
25397 DAG.setNodeMemRefs(AtomicInst, {MemOp});
25398
25399 SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
25400 if (DAG.getDataLayout().isBigEndian())
25401 std::swap(Lo, Hi);
25402
25403 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
25404 Results.push_back(SDValue(AtomicInst, 2)); // Chain out
25405}
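// Hedged sketch (assumed mapping, mirroring getAtomicLoad128Opcode above):
// with +lse128, "atomicrmw xchg ptr %p, i128 %v seq_cst" becomes SWPPAL,
// "atomicrmw or" becomes LDSETP*, and "atomicrmw and" becomes LDCLRP* with
// the value pair inverted first (x AND y == x CLR ~y), which is exactly why
// the XOR-with-minus-one nodes are inserted before the machine node.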
25406
25407void AArch64TargetLowering::ReplaceNodeResults(
25409 switch (N->getOpcode()) {
25410 default:
25411 llvm_unreachable("Don't know how to custom expand this");
25412 case ISD::BITCAST:
25413 ReplaceBITCASTResults(N, Results, DAG);
25414 return;
25415 case ISD::VECREDUCE_ADD:
25420 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
25421 return;
25422 case ISD::ADD:
25423 case ISD::FADD:
25424 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
25425 return;
25426
25427 case ISD::CTPOP:
25428 case ISD::PARITY:
25429 if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
25430 Results.push_back(Result);
25431 return;
25432 case AArch64ISD::SADDV:
25434 return;
25435 case AArch64ISD::UADDV:
25437 return;
25438 case AArch64ISD::SMINV:
25440 return;
25441 case AArch64ISD::UMINV:
25443 return;
25444 case AArch64ISD::SMAXV:
25446 return;
25447 case AArch64ISD::UMAXV:
25449 return;
25450 case ISD::MULHS:
25452 Results.push_back(
25453 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHS_PRED));
25454 return;
25455 case ISD::MULHU:
25457 Results.push_back(
25458 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHU_PRED));
25459 return;
25460 case ISD::FP_TO_UINT:
25461 case ISD::FP_TO_SINT:
25464 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
25465 // Let normal code take care of it by not adding anything to Results.
25466 return;
25467 case ISD::ATOMIC_CMP_SWAP:
25468 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
25469 return;
25470 case ISD::ATOMIC_LOAD_CLR:
25471 assert(N->getValueType(0) != MVT::i128 &&
25472 "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
25473 break;
25474 case ISD::ATOMIC_LOAD_AND:
25475 case ISD::ATOMIC_LOAD_OR:
25476 case ISD::ATOMIC_SWAP: {
25477 assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
25478 "Expected 128-bit atomicrmw.");
25479 // These need custom type legalisation so we go directly to instruction.
25480 ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
25481 return;
25482 }
25483 case ISD::ATOMIC_LOAD:
25484 case ISD::LOAD: {
25485 MemSDNode *LoadNode = cast<MemSDNode>(N);
25486 EVT MemVT = LoadNode->getMemoryVT();
25487 // Handle lowering 256 bit non temporal loads into LDNP for little-endian
25488 // targets.
25489 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
25490 MemVT.getSizeInBits() == 256u &&
25491 (MemVT.getScalarSizeInBits() == 8u ||
25492 MemVT.getScalarSizeInBits() == 16u ||
25493 MemVT.getScalarSizeInBits() == 32u ||
25494 MemVT.getScalarSizeInBits() == 64u)) {
25495
25496 SDValue Result = DAG.getMemIntrinsicNode(
25497 AArch64ISD::LDNP, SDLoc(N),
25498 DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
25499 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
25500 MVT::Other}),
25501 {LoadNode->getChain(), LoadNode->getBasePtr()},
25502 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
25503
25504 SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
25505 Result.getValue(0), Result.getValue(1));
25506 Results.append({Pair, Result.getValue(2) /* Chain */});
25507 return;
25508 }
25509
25510 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
25511 LoadNode->getMemoryVT() != MVT::i128) {
25512 // Non-volatile, non-atomic loads are optimized later in AArch64's load/store
25513 // optimizer.
25514 return;
25515 }
25516
25517 if (SDValue(N, 0).getValueType() == MVT::i128) {
25518 auto *AN = dyn_cast<AtomicSDNode>(LoadNode);
25519 bool isLoadAcquire =
25521 unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
25522
25523 if (isLoadAcquire)
25524 assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
25525
25526 SDValue Result = DAG.getMemIntrinsicNode(
25527 Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
25528 {LoadNode->getChain(), LoadNode->getBasePtr()},
25529 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
25530
25531 unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
25532
25533 SDValue Pair =
25534 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
25535 Result.getValue(FirstRes), Result.getValue(1 - FirstRes));
25536 Results.append({Pair, Result.getValue(2) /* Chain */});
25537 }
25538 return;
25539 }
25540 case ISD::EXTRACT_SUBVECTOR:
25541 ReplaceExtractSubVectorResults(N, Results, DAG);
25542 return;
25543 case ISD::INSERT_SUBVECTOR:
25544 case ISD::CONCAT_VECTORS:
25545 // Custom lowering has been requested for INSERT_SUBVECTOR and
25546 // CONCAT_VECTORS -- but delegate to common code for result type
25547 // legalisation
25548 return;
25549 case ISD::INTRINSIC_WO_CHAIN: {
25550 EVT VT = N->getValueType(0);
25551 assert((VT == MVT::i8 || VT == MVT::i16) &&
25552 "custom lowering for unexpected type");
25553
25554 Intrinsic::ID IntID =
25555 static_cast<Intrinsic::ID>(N->getConstantOperandVal(0));
25556 switch (IntID) {
25557 default:
25558 return;
25559 case Intrinsic::aarch64_sve_clasta_n: {
25560 SDLoc DL(N);
25561 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
25562 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
25563 N->getOperand(1), Op2, N->getOperand(3));
25564 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25565 return;
25566 }
25567 case Intrinsic::aarch64_sve_clastb_n: {
25568 SDLoc DL(N);
25569 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
25570 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
25571 N->getOperand(1), Op2, N->getOperand(3));
25572 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25573 return;
25574 }
25575 case Intrinsic::aarch64_sve_lasta: {
25576 SDLoc DL(N);
25577 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
25578 N->getOperand(1), N->getOperand(2));
25579 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25580 return;
25581 }
25582 case Intrinsic::aarch64_sve_lastb: {
25583 SDLoc DL(N);
25584 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
25585 N->getOperand(1), N->getOperand(2));
25586 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25587 return;
25588 }
25589 }
25590 }
25591 case ISD::READ_REGISTER: {
25592 SDLoc DL(N);
25593 assert(N->getValueType(0) == MVT::i128 &&
25594 "READ_REGISTER custom lowering is only for 128-bit sysregs");
25595 SDValue Chain = N->getOperand(0);
25596 SDValue SysRegName = N->getOperand(1);
25597
25598 SDValue Result = DAG.getNode(
25599 AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
25600 Chain, SysRegName);
25601
25602 // Sysregs are not endian. Result.getValue(0) always contains the lower half
25603 // of the 128-bit System Register value.
25604 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
25605 Result.getValue(0), Result.getValue(1));
25606 Results.push_back(Pair);
25607 Results.push_back(Result.getValue(2)); // Chain
25608 return;
25609 }
25610 }
25611}
25612
25613bool AArch64TargetLowering::useLoadStackGuardNode() const {
25614 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
25615 return false;
25616 return true;
25617}
25618
25619unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
25620 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
25621 // reciprocal if there are three or more FDIVs.
25622 return 3;
25623}
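// For illustration (an assumed source pattern, not from this file): with
// three or more divisions by the same value and fast-math,
//
//   a / d, b / d, c / d
//
// is rewritten as t = 1.0 / d; a * t, b * t, c * t, trading three fdivs for
// one fdiv plus three fmuls.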
25624
25625TargetLoweringBase::LegalizeTypeAction
25626AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
25627 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
25628 // v4i16, v2i32 instead of to promote.
25629 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
25630 VT == MVT::v1f32)
25631 return TypeWidenVector;
25632
25633 return TargetLoweringBase::getPreferredVectorAction(VT);
25634}
25635
25636// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
25637// provided the address is 16-byte aligned.
25638bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
25639 if (!Subtarget->hasLSE2())
25640 return false;
25641
25642 if (auto LI = dyn_cast<LoadInst>(I))
25643 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
25644 LI->getAlign() >= Align(16);
25645
25646 if (auto SI = dyn_cast<StoreInst>(I))
25647 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25648 SI->getAlign() >= Align(16);
25649
25650 return false;
25651}
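// A hedged example of the kind of access this accepts: an IR-level
//
//   %v = load atomic i128, ptr %p monotonic, align 16
//
// can be implemented as a plain 16-byte-aligned LDP under LSE2, while a load
// with align 8 (or on a target without LSE2) must use another expansion.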
25652
25653bool AArch64TargetLowering::isOpSuitableForLSE128(const Instruction *I) const {
25654 if (!Subtarget->hasLSE128())
25655 return false;
25656
25657 // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
25658 // will clobber the two registers.
25659 if (const auto *SI = dyn_cast<StoreInst>(I))
25660 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25661 SI->getAlign() >= Align(16) &&
25662 (SI->getOrdering() == AtomicOrdering::Release ||
25663 SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);
25664
25665 if (const auto *RMW = dyn_cast<AtomicRMWInst>(I))
25666 return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25667 RMW->getAlign() >= Align(16) &&
25668 (RMW->getOperation() == AtomicRMWInst::Xchg ||
25669 RMW->getOperation() == AtomicRMWInst::And ||
25670 RMW->getOperation() == AtomicRMWInst::Or);
25671
25672 return false;
25673}
25674
25675bool AArch64TargetLowering::isOpSuitableForRCPC3(const Instruction *I) const {
25676 if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
25677 return false;
25678
25679 if (auto LI = dyn_cast<LoadInst>(I))
25680 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
25681 LI->getAlign() >= Align(16) &&
25682 LI->getOrdering() == AtomicOrdering::Acquire;
25683
25684 if (auto SI = dyn_cast<StoreInst>(I))
25685 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25686 SI->getAlign() >= Align(16) &&
25687 SI->getOrdering() == AtomicOrdering::Release;
25688
25689 return false;
25690}
25691
25692bool AArch64TargetLowering::shouldInsertFencesForAtomic(
25693 const Instruction *I) const {
25694 if (isOpSuitableForRCPC3(I))
25695 return false;
25696 if (isOpSuitableForLSE128(I))
25697 return false;
25698 if (isOpSuitableForLDPSTP(I))
25699 return true;
25700 return false;
25701}
25702
25703bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore(
25704 const Instruction *I) const {
25705 // Store-Release instructions only provide seq_cst guarantees when paired with
25706 // Load-Acquire instructions. MSVC CRT does not use these instructions to
25707 // implement seq_cst loads and stores, so we need additional explicit fences
25708 // after memory writes.
25709 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
25710 return false;
25711
25712 switch (I->getOpcode()) {
25713 default:
25714 return false;
25715 case Instruction::AtomicCmpXchg:
25716 return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() ==
25717 AtomicOrdering::SequentiallyConsistent;
25718 case Instruction::AtomicRMW:
25719 return cast<AtomicRMWInst>(I)->getOrdering() ==
25720 AtomicOrdering::SequentiallyConsistent;
25721 case Instruction::Store:
25722 return cast<StoreInst>(I)->getOrdering() ==
25723 AtomicOrdering::SequentiallyConsistent;
25725}
25726
25727// Loads and stores less than 128-bits are already atomic; ones above that
25728// are doomed anyway, so defer to the default libcall and blame the OS when
25729// things go wrong.
25732 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
25733 if (Size != 128)
25735 if (isOpSuitableForRCPC3(SI))
25737 if (isOpSuitableForLSE128(SI))
25739 if (isOpSuitableForLDPSTP(SI))
25742}
25743
25744// Loads and stores less than 128-bits are already atomic; ones above that
25745// are doomed anyway, so defer to the default libcall and blame the OS when
25746// things go wrong.
25749 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
25750
25751 if (Size != 128)
25753 if (isOpSuitableForRCPC3(LI))
25755 // No LSE128 loads
25756 if (isOpSuitableForLDPSTP(LI))
25758
25759 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25760 // implement atomicrmw without spilling. If the target address is also on the
25761 // stack and close enough to the spill slot, this can lead to a situation
25762 // where the monitor always gets cleared and the atomic operation can never
25763 // succeed. So at -O0 lower this operation to a CAS loop.
25764 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
25766
25767 // Using CAS for an atomic load has a better chance of succeeding under high
25768 // contention situations. So use it if available.
25769 return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
25771}
25772
25773// The "default" for integer RMW operations is to expand to an LL/SC loop.
25774// However, with the LSE instructions (or outline-atomics mode, which provides
25775// library routines in place of the LSE-instructions), we can directly emit many
25776// operations instead.
25777//
25778// Floating-point operations are always emitted to a cmpxchg loop, because they
25779// may trigger a trap which aborts an LLSC sequence.
25782 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
25783 assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
25784
25785 if (AI->isFloatingPointOperation())
25787
25788 bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
25792 if (CanUseLSE128)
25794
25795 // Nand is not supported in LSE.
25796 // Leave 128 bits to LLSC or CmpXChg.
25797 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
25798 if (Subtarget->hasLSE())
25800 if (Subtarget->outlineAtomics()) {
25801 // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
25802 // Don't outline them unless
25803 // (1) high level <atomic> support approved:
25804 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
25805 // (2) low level libgcc and compiler-rt support implemented by:
25806 // min/max outline atomics helpers
25807 if (AI->getOperation() != AtomicRMWInst::Min &&
25812 }
25813 }
25814 }
25815
25816 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25817 // implement atomicrmw without spilling. If the target address is also on the
25818 // stack and close enough to the spill slot, this can lead to a situation
25819 // where the monitor always gets cleared and the atomic operation can never
25820 // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
25821 // we have a single CAS instruction that can replace the loop.
25823 Subtarget->hasLSE())
25825
25827}
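// Hedged illustration of the policy above (the instruction choice is the
// usual LSE mapping, not taken from this file): with +lse,
//
//   atomicrmw add ptr %p, i32 1 seq_cst
//
// stays intact and selects to LDADDAL, whereas without LSE (and without
// outline-atomics) it is expanded to an LDAXR/STLXR retry loop, and a
// floating-point atomicrmw is always routed through a cmpxchg loop.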
25828
25831 AtomicCmpXchgInst *AI) const {
25832 // If subtarget has LSE, leave cmpxchg intact for codegen.
25833 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
25835 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25836 // implement cmpxchg without spilling. If the address being exchanged is also
25837 // on the stack and close enough to the spill slot, this can lead to a
25838 // situation where the monitor always gets cleared and the atomic operation
25839 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
25840 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
25842
25843 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
25844 // it.
25846 if (Size > 64)
25848
25850}
25851
25853 Type *ValueTy, Value *Addr,
25854 AtomicOrdering Ord) const {
25855 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25856 bool IsAcquire = isAcquireOrStronger(Ord);
25857
25858 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldxp
25859 // intrinsic must return {i64, i64} and we have to recombine them into a
25860 // single i128 here.
25861 if (ValueTy->getPrimitiveSizeInBits() == 128) {
25863 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
25865
25866 Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
25867
25868 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
25869 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
25870 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
25871 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
25872 return Builder.CreateOr(
25873 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64");
25874 }
25875
25876 Type *Tys[] = { Addr->getType() };
25878 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
25879 Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
25880
25881 const DataLayout &DL = M->getDataLayout();
25882 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
25883 CallInst *CI = Builder.CreateCall(Ldxr, Addr);
25884 CI->addParamAttr(
25885 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy));
25886 Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
25887
25888 return Builder.CreateBitCast(Trunc, ValueTy);
25889}
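// Rough sketch of the IR emitted by the i128 path above (value names are
// illustrative):
//
//   %lohi = call { i64, i64 } @llvm.aarch64.ldaxp(ptr %addr)
//   %lo   = extractvalue { i64, i64 } %lohi, 0
//   %hi   = extractvalue { i64, i64 } %lohi, 1
//   ; both halves are zero-extended to i128 and recombined as lo | (hi << 64)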
25890
25892 IRBuilderBase &Builder) const {
25893 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25894 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
25895}
25896
25898 Value *Val, Value *Addr,
25899 AtomicOrdering Ord) const {
25900 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25901 bool IsRelease = isReleaseOrStronger(Ord);
25902
25903 // Since the intrinsics must have legal type, the i128 intrinsics take two
25904 // parameters: "i64, i64". We must marshal Val into the appropriate form
25905 // before the call.
25906 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
25908 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
25910 Type *Int64Ty = Type::getInt64Ty(M->getContext());
25911
25912 Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
25913 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
25914 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
25915 }
25916
25918 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
25919 Type *Tys[] = { Addr->getType() };
25920 Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
25921
25922 const DataLayout &DL = M->getDataLayout();
25923 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
25924 Val = Builder.CreateBitCast(Val, IntValTy);
25925
25926 CallInst *CI = Builder.CreateCall(
25927 Stxr, {Builder.CreateZExtOrBitCast(
25928 Val, Stxr->getFunctionType()->getParamType(0)),
25929 Addr});
25930 CI->addParamAttr(1, Attribute::get(Builder.getContext(),
25931 Attribute::ElementType, Val->getType()));
25932 return CI;
25933}
25934
25936 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
25937 const DataLayout &DL) const {
25938 if (!Ty->isArrayTy()) {
25939 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
25940 return TySize.isScalable() && TySize.getKnownMinValue() > 128;
25941 }
25942
25943 // All non-aggregate members of the type must have the same type.
25944 SmallVector<EVT> ValueVTs;
25945 ComputeValueVTs(*this, DL, Ty, ValueVTs);
25946 return all_equal(ValueVTs);
25947}
25948
25949bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
25950 EVT) const {
25951 return false;
25952}
25953
25954static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
25955 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
25956 Function *ThreadPointerFunc =
25957 Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
25958 return IRB.CreatePointerCast(
25959 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
25960 Offset),
25961 IRB.getPtrTy(0));
25962}
25963
25965 // Android provides a fixed TLS slot for the stack cookie. See the definition
25966 // of TLS_SLOT_STACK_GUARD in
25967 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
25968 if (Subtarget->isTargetAndroid())
25969 return UseTlsOffset(IRB, 0x28);
25970
25971 // Fuchsia is similar.
25972 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
25973 if (Subtarget->isTargetFuchsia())
25974 return UseTlsOffset(IRB, -0x10);
25975
25977}
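// Hedged example of the resulting guard access on Android (the codegen shown
// is the expected pattern, not copied from a real compilation):
//
//   mrs x8, TPIDR_EL0
//   ldr x8, [x8, #0x28]   // TLS_SLOT_STACK_GUARD
//
// i.e. the cookie is read straight out of the thread pointer block instead
// of from a __stack_chk_guard global.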
25978
25980 // MSVC CRT provides functionalities for stack protection.
25981 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
25982 // MSVC CRT has a global variable holding security cookie.
25983 M.getOrInsertGlobal("__security_cookie",
25984 PointerType::getUnqual(M.getContext()));
25985
25986 // MSVC CRT has a function to validate security cookie.
25987 FunctionCallee SecurityCheckCookie =
25988 M.getOrInsertFunction(Subtarget->getSecurityCheckCookieName(),
25989 Type::getVoidTy(M.getContext()),
25990 PointerType::getUnqual(M.getContext()));
25991 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
25992 F->setCallingConv(CallingConv::Win64);
25993 F->addParamAttr(0, Attribute::AttrKind::InReg);
25994 }
25995 return;
25996 }
25998}
25999
26001 // MSVC CRT has a global variable holding security cookie.
26002 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
26003 return M.getGlobalVariable("__security_cookie");
26005}
26006
26008 // MSVC CRT has a function to validate security cookie.
26009 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
26010 return M.getFunction(Subtarget->getSecurityCheckCookieName());
26012}
26013
26014Value *
26016 // Android provides a fixed TLS slot for the SafeStack pointer. See the
26017 // definition of TLS_SLOT_SAFESTACK in
26018 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
26019 if (Subtarget->isTargetAndroid())
26020 return UseTlsOffset(IRB, 0x48);
26021
26022 // Fuchsia is similar.
26023 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
26024 if (Subtarget->isTargetFuchsia())
26025 return UseTlsOffset(IRB, -0x8);
26026
26028}
26029
26031 const Instruction &AndI) const {
26032 // Only sink 'and' mask to cmp use block if it is masking a single bit, since
26033 // this likely allows the and/cmp/br to be folded into a single tbz instruction. It
26034 // may be beneficial to sink in other cases, but we would have to check that
26035 // the cmp would not get folded into the br to form a cbz for these to be
26036 // beneficial.
26037 ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
26038 if (!Mask)
26039 return false;
26040 return Mask->getValue().isPowerOf2();
26041}
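// For illustration (assumed source, typical lowering): sinking the mask is
// aimed at source like
//
//   if ((x & 8) == 0) { ... }
//
// which folds to a single "tbz w0, #3, <target>" when the AND, the compare
// and the branch end up in the same block; multi-bit masks do not get this
// benefit, hence the power-of-two restriction.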
26042
26046 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
26047 SelectionDAG &DAG) const {
26048 // Does baseline recommend not to perform the fold by default?
26050 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
26051 return false;
26052 // Else, if this is a vector shift, prefer 'shl'.
26053 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
26054}
26055
26058 SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
26060 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
26063 ExpansionFactor);
26064}
26065
26067 // Update IsSplitCSR in AArch64FunctionInfo.
26068 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
26069 AFI->setIsSplitCSR(true);
26070}
26071
26073 MachineBasicBlock *Entry,
26074 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
26075 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
26076 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
26077 if (!IStart)
26078 return;
26079
26080 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
26081 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
26082 MachineBasicBlock::iterator MBBI = Entry->begin();
26083 for (const MCPhysReg *I = IStart; *I; ++I) {
26084 const TargetRegisterClass *RC = nullptr;
26085 if (AArch64::GPR64RegClass.contains(*I))
26086 RC = &AArch64::GPR64RegClass;
26087 else if (AArch64::FPR64RegClass.contains(*I))
26088 RC = &AArch64::FPR64RegClass;
26089 else
26090 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
26091
26092 Register NewVR = MRI->createVirtualRegister(RC);
26093 // Create copy from CSR to a virtual register.
26094 // FIXME: this currently does not emit CFI pseudo-instructions, it works
26095 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
26096 // nounwind. If we want to generalize this later, we may need to emit
26097 // CFI pseudo-instructions.
26098 assert(Entry->getParent()->getFunction().hasFnAttribute(
26099 Attribute::NoUnwind) &&
26100 "Function should be nounwind in insertCopiesSplitCSR!");
26101 Entry->addLiveIn(*I);
26102 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
26103 .addReg(*I);
26104
26105 // Insert the copy-back instructions right before the terminator.
26106 for (auto *Exit : Exits)
26107 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
26108 TII->get(TargetOpcode::COPY), *I)
26109 .addReg(NewVR);
26110 }
26111}
26112
26114 // Integer division on AArch64 is expensive. However, when aggressively
26115 // optimizing for code size, we prefer to use a div instruction, as it is
26116 // usually smaller than the alternative sequence.
26117 // The exception to this is vector division. Since AArch64 doesn't have vector
26118 // integer division, leaving the division as-is is a loss even in terms of
26119 // size, because it will have to be scalarized, while the alternative code
26120 // sequence can be performed in vector form.
26121 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
26122 return OptSize && !VT.isVector();
26123}
26124
26126 // We want inc-of-add for scalars and sub-of-not for vectors.
26127 return VT.isScalarInteger();
26128}
26129
26131 EVT VT) const {
26132 // v8f16 without fp16 need to be extended to v8f32, which is more difficult to
26133 // legalize.
26134 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
26135 return false;
26136 if (FPVT == MVT::v8bf16)
26137 return false;
26138 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
26139}
26140
26144 const TargetInstrInfo *TII) const {
26145 assert(MBBI->isCall() && MBBI->getCFIType() &&
26146 "Invalid call instruction for a KCFI check");
26147
26148 switch (MBBI->getOpcode()) {
26149 case AArch64::BLR:
26150 case AArch64::BLRNoIP:
26151 case AArch64::TCRETURNri:
26152 case AArch64::TCRETURNrix16x17:
26153 case AArch64::TCRETURNrix17:
26154 case AArch64::TCRETURNrinotx16:
26155 break;
26156 default:
26157 llvm_unreachable("Unexpected CFI call opcode");
26158 }
26159
26160 MachineOperand &Target = MBBI->getOperand(0);
26161 assert(Target.isReg() && "Invalid target operand for an indirect call");
26162 Target.setIsRenamable(false);
26163
26164 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK))
26165 .addReg(Target.getReg())
26166 .addImm(MBBI->getCFIType())
26167 .getInstr();
26168}
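// Illustrative sketch (assumption, not upstream output): for an indirect call
// through $x1 whose CFI type hash is 0x12345678, the check built above would
// show up in MIR roughly as:
//
//   KCFI_CHECK $x1, 305419896
//   BLR $x1, ...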
26169
26171 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
26172}
26173
26174unsigned
26176 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
26177 return getPointerTy(DL).getSizeInBits();
26178
26179 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
26180}
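// Illustrative note (assumption): the 3 * pointer-size + 2 * 32 figure above
// corresponds to the AAPCS64 va_list layout, roughly:
//
//   struct va_list {
//     void *__stack;   // 64 bits
//     void *__gr_top;  // 64 bits
//     void *__vr_top;  // 64 bits
//     int __gr_offs;   // 32 bits
//     int __vr_offs;   // 32 bits
//   };                 // 3 * 64 + 2 * 32 bits total
//
// whereas Darwin and Windows use a single-pointer char* va_list.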
26181
26182void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
26183 MachineFrameInfo &MFI = MF.getFrameInfo();
26184 // If we have any vulnerable SVE stack objects then the stack protector
26185 // needs to be placed at the top of the SVE stack area, as the SVE locals
26186 // are placed above the other locals, so we allocate it as if it were a
26187 // scalable vector.
26188 // FIXME: It may be worthwhile having a specific interface for this rather
26189 // than doing it here in finalizeLowering.
26190 if (MFI.hasStackProtectorIndex()) {
26191 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
26197 break;
26198 }
26199 }
26200 }
26203}
26204
26205// Unlike X86, we let frame lowering assign offsets to all catch objects.
26207 return false;
26208}
26209
26210bool AArch64TargetLowering::shouldLocalize(
26211 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
26212 auto &MF = *MI.getMF();
26213 auto &MRI = MF.getRegInfo();
26214 auto maxUses = [](unsigned RematCost) {
26215 // A cost of 1 means remats are basically free.
26216 if (RematCost == 1)
26217 return std::numeric_limits<unsigned>::max();
26218 if (RematCost == 2)
26219 return 2U;
26220
26221 // Remat is too expensive, only sink if there's one user.
26222 if (RematCost > 2)
26223 return 1U;
26224 llvm_unreachable("Unexpected remat cost");
26225 };
26226
26227 unsigned Opc = MI.getOpcode();
26228 switch (Opc) {
26229 case TargetOpcode::G_GLOBAL_VALUE: {
26230 // On Darwin, TLS global vars get selected into function calls, which
26231 // we don't want localized, as they can get moved into the middle of
26232 // another call sequence.
26233 const GlobalValue &GV = *MI.getOperand(1).getGlobal();
26234 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
26235 return false;
26236 return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
26237 }
26238 case TargetOpcode::G_FCONSTANT:
26239 case TargetOpcode::G_CONSTANT: {
26240 const ConstantInt *CI;
26241 unsigned AdditionalCost = 0;
26242
26243 if (Opc == TargetOpcode::G_CONSTANT)
26244 CI = MI.getOperand(1).getCImm();
26245 else {
26246 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
26247 // We try to estimate the cost of 32/64-bit fp immediates, as they'll likely
26248 // be materialized as integers.
26249 if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
26250 break;
26251 auto APF = MI.getOperand(1).getFPImm()->getValueAPF();
26252 bool OptForSize =
26255 OptForSize))
26256 return true; // Constant should be cheap.
26257 CI =
26258 ConstantInt::get(MF.getFunction().getContext(), APF.bitcastToAPInt());
26259 // FP materialization also costs an extra move, from gpr to fpr.
26260 AdditionalCost = 1;
26261 }
26262 APInt Imm = CI->getValue();
26265 assert(Cost.isValid() && "Expected a valid imm cost");
26266
26267 unsigned RematCost = *Cost.getValue();
26268 RematCost += AdditionalCost;
26269 Register Reg = MI.getOperand(0).getReg();
26270 unsigned MaxUses = maxUses(RematCost);
26271 // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
26272 if (MaxUses == std::numeric_limits<unsigned>::max())
26273 --MaxUses;
26274 return MRI.hasAtMostUserInstrs(Reg, MaxUses);
26275 }
26276 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
26277 // localizable.
26278 case AArch64::ADRP:
26279 case AArch64::G_ADD_LOW:
26280 // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
26281 case TargetOpcode::G_PTR_ADD:
26282 return true;
26283 default:
26284 break;
26285 }
26286 return TargetLoweringBase::shouldLocalize(MI, TTI);
26287}
26288
26290 if (Inst.getType()->isScalableTy())
26291 return true;
26292
26293 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
26294 if (Inst.getOperand(i)->getType()->isScalableTy())
26295 return true;
26296
26297 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
26298 if (AI->getAllocatedType()->isScalableTy())
26299 return true;
26300 }
26301
26302 // Checks to allow the use of SME instructions
26303 if (auto *Base = dyn_cast<CallBase>(&Inst)) {
26304 auto CallerAttrs = SMEAttrs(*Inst.getFunction());
26305 auto CalleeAttrs = SMEAttrs(*Base);
26306 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
26307 CallerAttrs.requiresLazySave(CalleeAttrs) ||
26308 CallerAttrs.requiresPreservingZT0(CalleeAttrs))
26309 return true;
26310 }
26311 return false;
26312}
26313
26314// Return the largest legal scalable vector type that matches VT's element type.
26318 "Expected legal fixed length vector!");
26319 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
26320 default:
26321 llvm_unreachable("unexpected element type for SVE container");
26322 case MVT::i8:
26323 return EVT(MVT::nxv16i8);
26324 case MVT::i16:
26325 return EVT(MVT::nxv8i16);
26326 case MVT::i32:
26327 return EVT(MVT::nxv4i32);
26328 case MVT::i64:
26329 return EVT(MVT::nxv2i64);
26330 case MVT::bf16:
26331 return EVT(MVT::nxv8bf16);
26332 case MVT::f16:
26333 return EVT(MVT::nxv8f16);
26334 case MVT::f32:
26335 return EVT(MVT::nxv4f32);
26336 case MVT::f64:
26337 return EVT(MVT::nxv2f64);
26338 }
26339}
26340
26341// Return a PTRUE with active lanes corresponding to the extent of VT.
26343 EVT VT) {
26346 "Expected legal fixed length vector!");
26347
26348 std::optional<unsigned> PgPattern =
26350 assert(PgPattern && "Unexpected element count for SVE predicate");
26351
26352 // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
26353 // AArch64SVEPredPattern::all, which can enable the use of unpredicated
26354 // variants of instructions when available.
26355 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
26356 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
26357 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
26358 if (MaxSVESize && MinSVESize == MaxSVESize &&
26359 MaxSVESize == VT.getSizeInBits())
26360 PgPattern = AArch64SVEPredPattern::all;
26361
26362 MVT MaskVT;
26363 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
26364 default:
26365 llvm_unreachable("unexpected element type for SVE predicate");
26366 case MVT::i8:
26367 MaskVT = MVT::nxv16i1;
26368 break;
26369 case MVT::i16:
26370 case MVT::f16:
26371 case MVT::bf16:
26372 MaskVT = MVT::nxv8i1;
26373 break;
26374 case MVT::i32:
26375 case MVT::f32:
26376 MaskVT = MVT::nxv4i1;
26377 break;
26378 case MVT::i64:
26379 case MVT::f64:
26380 MaskVT = MVT::nxv2i1;
26381 break;
26382 }
26383
26384 return getPTrue(DAG, DL, MaskVT, *PgPattern);
26385}
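// Illustrative example (assumption): for a fixed-length v8i16 on a target
// whose SVE registers are wider than 128 bits, the code above returns roughly
//
//   getPTrue(DAG, DL, MVT::nxv8i1, AArch64SVEPredPattern::vl8)
//
// i.e. an nxv8i1 predicate with exactly the first eight lanes active.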
26386
26388 EVT VT) {
26390 "Expected legal scalable vector!");
26391 auto PredTy = VT.changeVectorElementType(MVT::i1);
26392 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
26393}
26394
26396 if (VT.isFixedLengthVector())
26397 return getPredicateForFixedLengthVector(DAG, DL, VT);
26398
26399 return getPredicateForScalableVector(DAG, DL, VT);
26400}
26401
26402// Grow V to consume an entire SVE register.
26404 assert(VT.isScalableVector() &&
26405 "Expected to convert into a scalable vector!");
26406 assert(V.getValueType().isFixedLengthVector() &&
26407 "Expected a fixed length vector operand!");
26408 SDLoc DL(V);
26409 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
26410 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
26411}
26412
26413// Shrink V so it's just big enough to maintain a VT's worth of data.
26416 "Expected to convert into a fixed length vector!");
26417 assert(V.getValueType().isScalableVector() &&
26418 "Expected a scalable vector operand!");
26419 SDLoc DL(V);
26420 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
26421 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
26422}
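// Illustrative sketch (assumption, not upstream code): the helpers above are
// combined throughout the remainder of this file along the following lines,
// e.g. when lowering a fixed-length v4i32 sdiv via the predicated SVE node:
//
//   EVT VT = Op.getValueType();                                  // v4i32
//   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); // nxv4i32
//   SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
//   SDValue A = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
//   SDValue B = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
//   SDValue Div =
//       DAG.getNode(AArch64ISD::SDIV_PRED, DL, ContainerVT, Pg, A, B);
//   return convertFromScalableVector(DAG, VT, Div);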
26423
26424// Convert all fixed length vector loads larger than NEON to masked_loads.
26425SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
26426 SDValue Op, SelectionDAG &DAG) const {
26427 auto Load = cast<LoadSDNode>(Op);
26428
26429 SDLoc DL(Op);
26430 EVT VT = Op.getValueType();
26431 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26432 EVT LoadVT = ContainerVT;
26433 EVT MemVT = Load->getMemoryVT();
26434
26435 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
26436
26437 if (VT.isFloatingPoint()) {
26438 LoadVT = ContainerVT.changeTypeToInteger();
26439 MemVT = MemVT.changeTypeToInteger();
26440 }
26441
26442 SDValue NewLoad = DAG.getMaskedLoad(
26443 LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
26444 DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
26445 Load->getAddressingMode(), Load->getExtensionType());
26446
26447 SDValue Result = NewLoad;
26448 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
26449 EVT ExtendVT = ContainerVT.changeVectorElementType(
26450 Load->getMemoryVT().getVectorElementType());
26451
26452 Result = getSVESafeBitCast(ExtendVT, Result, DAG);
26454 Pg, Result, DAG.getUNDEF(ContainerVT));
26455 } else if (VT.isFloatingPoint()) {
26456 Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
26457 }
26458
26459 Result = convertFromScalableVector(DAG, VT, Result);
26460 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
26461 return DAG.getMergeValues(MergedValues, DL);
26462}
26463
26465 SelectionDAG &DAG) {
26466 SDLoc DL(Mask);
26467 EVT InVT = Mask.getValueType();
26468 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26469
26470 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
26471
26472 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
26473 return Pg;
26474
26475 auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
26476 auto Op2 = DAG.getConstant(0, DL, ContainerVT);
26477
26479 {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});
26480}
26481
26482// Convert fixed length vector masked loads larger than NEON to SVE masked loads.
26483SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
26484 SDValue Op, SelectionDAG &DAG) const {
26485 auto Load = cast<MaskedLoadSDNode>(Op);
26486
26487 SDLoc DL(Op);
26488 EVT VT = Op.getValueType();
26489 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26490
26491 SDValue Mask = Load->getMask();
26492 // If this is an extending load and the mask type is not the same as the
26493 // load's type then we have to extend the mask type.
26494 if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
26495 assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
26496 "Incorrect mask type");
26497 Mask = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Mask);
26498 }
26499 Mask = convertFixedMaskToScalableVector(Mask, DAG);
26500
26501 SDValue PassThru;
26502 bool IsPassThruZeroOrUndef = false;
26503
26504 if (Load->getPassThru()->isUndef()) {
26505 PassThru = DAG.getUNDEF(ContainerVT);
26506 IsPassThruZeroOrUndef = true;
26507 } else {
26508 if (ContainerVT.isInteger())
26509 PassThru = DAG.getConstant(0, DL, ContainerVT);
26510 else
26511 PassThru = DAG.getConstantFP(0, DL, ContainerVT);
26512 if (isZerosVector(Load->getPassThru().getNode()))
26513 IsPassThruZeroOrUndef = true;
26514 }
26515
26516 SDValue NewLoad = DAG.getMaskedLoad(
26517 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
26518 Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
26519 Load->getAddressingMode(), Load->getExtensionType());
26520
26521 SDValue Result = NewLoad;
26522 if (!IsPassThruZeroOrUndef) {
26523 SDValue OldPassThru =
26524 convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
26525 Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
26526 }
26527
26528 Result = convertFromScalableVector(DAG, VT, Result);
26529 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
26530 return DAG.getMergeValues(MergedValues, DL);
26531}
26532
26533// Convert all fixed length vector stores larger than NEON to masked_stores.
26534SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
26535 SDValue Op, SelectionDAG &DAG) const {
26536 auto Store = cast<StoreSDNode>(Op);
26537
26538 SDLoc DL(Op);
26539 EVT VT = Store->getValue().getValueType();
26540 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26541 EVT MemVT = Store->getMemoryVT();
26542
26543 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
26544 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
26545
26546 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
26547 EVT TruncVT = ContainerVT.changeVectorElementType(
26548 Store->getMemoryVT().getVectorElementType());
26549 MemVT = MemVT.changeTypeToInteger();
26550 NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
26551 NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
26552 DAG.getUNDEF(TruncVT));
26553 NewValue =
26554 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
26555 } else if (VT.isFloatingPoint()) {
26556 MemVT = MemVT.changeTypeToInteger();
26557 NewValue =
26558 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
26559 }
26560
26561 return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
26562 Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
26563 Store->getMemOperand(), Store->getAddressingMode(),
26564 Store->isTruncatingStore());
26565}
26566
26567SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
26568 SDValue Op, SelectionDAG &DAG) const {
26569 auto *Store = cast<MaskedStoreSDNode>(Op);
26570
26571 SDLoc DL(Op);
26572 EVT VT = Store->getValue().getValueType();
26573 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26574
26575 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
26576 SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG);
26577
26578 return DAG.getMaskedStore(
26579 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
26580 Mask, Store->getMemoryVT(), Store->getMemOperand(),
26581 Store->getAddressingMode(), Store->isTruncatingStore());
26582}
26583
26584SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
26585 SDValue Op, SelectionDAG &DAG) const {
26586 SDLoc dl(Op);
26587 EVT VT = Op.getValueType();
26588 EVT EltVT = VT.getVectorElementType();
26589
26590 bool Signed = Op.getOpcode() == ISD::SDIV;
26591 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
26592
26593 bool Negated;
26594 uint64_t SplatVal;
26595 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
26596 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26597 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
26598 SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32);
26599
26600 SDValue Pg = getPredicateForFixedLengthVector(DAG, dl, VT);
26601 SDValue Res =
26602 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, ContainerVT, Pg, Op1, Op2);
26603 if (Negated)
26604 Res = DAG.getNode(ISD::SUB, dl, ContainerVT,
26605 DAG.getConstant(0, dl, ContainerVT), Res);
26606
26607 return convertFromScalableVector(DAG, VT, Res);
26608 }
26609
26610 // Scalable vector i32/i64 DIV is supported.
26611 if (EltVT == MVT::i32 || EltVT == MVT::i64)
26612 return LowerToPredicatedOp(Op, DAG, PredOpcode);
26613
26614 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
26615 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
26616 EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
26617 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
26618
26619 // If the wider type is legal: extend, op, and truncate.
26620 EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
26621 if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
26622 SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(0));
26623 SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(1));
26624 SDValue Div = DAG.getNode(Op.getOpcode(), dl, WideVT, Op0, Op1);
26625 return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
26626 }
26627
26628 auto HalveAndExtendVector = [&DAG, &dl, &HalfVT, &PromVT,
26629 &ExtendOpcode](SDValue Op) {
26630 SDValue IdxZero = DAG.getConstant(0, dl, MVT::i64);
26631 SDValue IdxHalf =
26632 DAG.getConstant(HalfVT.getVectorNumElements(), dl, MVT::i64);
26633 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxZero);
26634 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxHalf);
26635 return std::pair<SDValue, SDValue>(
26636 {DAG.getNode(ExtendOpcode, dl, PromVT, Lo),
26637 DAG.getNode(ExtendOpcode, dl, PromVT, Hi)});
26638 };
26639
26640 // If the wider type is not legal: split, extend, op, truncate and concatenate.
26641 auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
26642 auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
26643 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0LoExt, Op1LoExt);
26644 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0HiExt, Op1HiExt);
26645 SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Lo);
26646 SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Hi);
26647 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, {LoTrunc, HiTrunc});
26648}
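// Illustrative example (assumption): for a fixed-length v8i16 sdiv where the
// widened v8i32 type is legal, the "extend, op, truncate" path above emits:
//
//   SDValue Op0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i32, X);
//   SDValue Op1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i32, Y);
//   SDValue Div = DAG.getNode(ISD::SDIV, dl, MVT::v8i32, Op0, Op1);
//   return DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i16, Div);
//
// with the v8i32 division itself then lowered via the predicated SVE node.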
26649
26650SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
26651 SDValue Op, SelectionDAG &DAG) const {
26652 EVT VT = Op.getValueType();
26653 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26654
26655 SDLoc DL(Op);
26656 SDValue Val = Op.getOperand(0);
26657 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
26658 Val = convertToScalableVector(DAG, ContainerVT, Val);
26659
26660 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
26661 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
26662
26663 // Repeatedly unpack Val until the result is of the desired element type.
26664 switch (ContainerVT.getSimpleVT().SimpleTy) {
26665 default:
26666 llvm_unreachable("unimplemented container type");
26667 case MVT::nxv16i8:
26668 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
26669 if (VT.getVectorElementType() == MVT::i16)
26670 break;
26671 [[fallthrough]];
26672 case MVT::nxv8i16:
26673 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
26674 if (VT.getVectorElementType() == MVT::i32)
26675 break;
26676 [[fallthrough]];
26677 case MVT::nxv4i32:
26678 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
26679 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
26680 break;
26681 }
26682
26683 return convertFromScalableVector(DAG, VT, Val);
26684}
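// Illustrative example (assumption): sign-extending a fixed-length v4i8 to
// v4i32 starts from the nxv16i8 container and unpacks twice through the
// switch above:
//
//   Val = DAG.getNode(AArch64ISD::SUNPKLO, DL, MVT::nxv8i16, Val);
//   Val = DAG.getNode(AArch64ISD::SUNPKLO, DL, MVT::nxv4i32, Val);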
26685
26686SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
26687 SDValue Op, SelectionDAG &DAG) const {
26688 EVT VT = Op.getValueType();
26689 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26690
26691 SDLoc DL(Op);
26692 SDValue Val = Op.getOperand(0);
26693 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
26694 Val = convertToScalableVector(DAG, ContainerVT, Val);
26695
26696 // Repeatedly truncate Val until the result is of the desired element type.
26697 switch (ContainerVT.getSimpleVT().SimpleTy) {
26698 default:
26699 llvm_unreachable("unimplemented container type");
26700 case MVT::nxv2i64:
26701 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
26702 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
26703 if (VT.getVectorElementType() == MVT::i32)
26704 break;
26705 [[fallthrough]];
26706 case MVT::nxv4i32:
26707 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
26708 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
26709 if (VT.getVectorElementType() == MVT::i16)
26710 break;
26711 [[fallthrough]];
26712 case MVT::nxv8i16:
26713 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
26714 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
26715 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
26716 break;
26717 }
26718
26719 return convertFromScalableVector(DAG, VT, Val);
26720}
26721
26722SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
26723 SDValue Op, SelectionDAG &DAG) const {
26724 EVT VT = Op.getValueType();
26725 EVT InVT = Op.getOperand(0).getValueType();
26726 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
26727
26728 SDLoc DL(Op);
26729 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26730 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
26731
26732 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
26733}
26734
26735SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
26736 SDValue Op, SelectionDAG &DAG) const {
26737 EVT VT = Op.getValueType();
26738 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26739
26740 SDLoc DL(Op);
26741 EVT InVT = Op.getOperand(0).getValueType();
26742 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26743 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
26744
26745 auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
26746 Op.getOperand(1), Op.getOperand(2));
26747
26748 return convertFromScalableVector(DAG, VT, ScalableRes);
26749}
26750
26751// Convert vector operation 'Op' to an equivalent predicated operation whereby
26752// the original operation's type is used to construct a suitable predicate.
26753// NOTE: The results for inactive lanes are undefined.
26754SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
26755 SelectionDAG &DAG,
26756 unsigned NewOp) const {
26757 EVT VT = Op.getValueType();
26758 SDLoc DL(Op);
26759 auto Pg = getPredicateForVector(DAG, DL, VT);
26760
26761 if (VT.isFixedLengthVector()) {
26762 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
26763 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26764
26765 // Create list of operands by converting existing ones to scalable types.
26767 for (const SDValue &V : Op->op_values()) {
26768 if (isa<CondCodeSDNode>(V)) {
26769 Operands.push_back(V);
26770 continue;
26771 }
26772
26773 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
26774 EVT VTArg = VTNode->getVT().getVectorElementType();
26775 EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
26776 Operands.push_back(DAG.getValueType(NewVTArg));
26777 continue;
26778 }
26779
26780 assert(isTypeLegal(V.getValueType()) &&
26781 "Expected only legal fixed-width types");
26782 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
26783 }
26784
26785 if (isMergePassthruOpcode(NewOp))
26786 Operands.push_back(DAG.getUNDEF(ContainerVT));
26787
26788 auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
26789 return convertFromScalableVector(DAG, VT, ScalableRes);
26790 }
26791
26792 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
26793
26795 for (const SDValue &V : Op->op_values()) {
26796 assert((!V.getValueType().isVector() ||
26797 V.getValueType().isScalableVector()) &&
26798 "Only scalable vectors are supported!");
26799 Operands.push_back(V);
26800 }
26801
26802 if (isMergePassthruOpcode(NewOp))
26803 Operands.push_back(DAG.getUNDEF(VT));
26804
26805 return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
26806}
26807
26808// If a fixed length vector operation has no side effects when applied to
26809// undefined elements, we can safely use scalable vectors to perform the same
26810// operation without needing to worry about predication.
26811SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
26812 SelectionDAG &DAG) const {
26813 EVT VT = Op.getValueType();
26815 "Only expected to lower fixed length vector operation!");
26816 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26817
26818 // Create list of operands by converting existing ones to scalable types.
26820 for (const SDValue &V : Op->op_values()) {
26821 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
26822
26823 // Pass through non-vector operands.
26824 if (!V.getValueType().isVector()) {
26825 Ops.push_back(V);
26826 continue;
26827 }
26828
26829 // "cast" fixed length vector to a scalable vector.
26830 assert(V.getValueType().isFixedLengthVector() &&
26831 isTypeLegal(V.getValueType()) &&
26832 "Only fixed length vectors are supported!");
26833 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
26834 }
26835
26836 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
26837 return convertFromScalableVector(DAG, VT, ScalableRes);
26838}
26839
26840SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
26841 SelectionDAG &DAG) const {
26842 SDLoc DL(ScalarOp);
26843 SDValue AccOp = ScalarOp.getOperand(0);
26844 SDValue VecOp = ScalarOp.getOperand(1);
26845 EVT SrcVT = VecOp.getValueType();
26846 EVT ResVT = SrcVT.getVectorElementType();
26847
26848 EVT ContainerVT = SrcVT;
26849 if (SrcVT.isFixedLengthVector()) {
26850 ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
26851 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
26852 }
26853
26854 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
26855 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
26856
26857 // Convert operands to Scalable.
26858 AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
26859 DAG.getUNDEF(ContainerVT), AccOp, Zero);
26860
26861 // Perform reduction.
26862 SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
26863 Pg, AccOp, VecOp);
26864
26865 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
26866}
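// Illustrative note (assumption): FADDA performs a strictly ordered
// floating-point reduction, so for a v4f32 input the result is
//
//   (((Acc + v0) + v1) + v2) + v3
//
// which matches the sequential semantics required by VECREDUCE_SEQ_FADD.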
26867
26868SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
26869 SelectionDAG &DAG) const {
26870 SDLoc DL(ReduceOp);
26871 SDValue Op = ReduceOp.getOperand(0);
26872 EVT OpVT = Op.getValueType();
26873 EVT VT = ReduceOp.getValueType();
26874
26875 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
26876 return SDValue();
26877
26878 SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
26879
26880 switch (ReduceOp.getOpcode()) {
26881 default:
26882 return SDValue();
26883 case ISD::VECREDUCE_OR:
26884 if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
26885 // The predicate can be 'Op' because
26886 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
26887 return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
26888 else
26889 return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
26890 case ISD::VECREDUCE_AND: {
26891 Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
26892 return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
26893 }
26894 case ISD::VECREDUCE_XOR: {
26895 SDValue ID =
26896 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
26897 if (OpVT == MVT::nxv1i1) {
26898 // Emulate a CNTP on .Q using .D and a different governing predicate.
26899 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
26900 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
26901 }
26902 SDValue Cntp =
26903 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
26904 return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
26905 }
26906 }
26907
26908 return SDValue();
26909}
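// Illustrative note (assumption): the VECREDUCE_AND case above relies on
// De Morgan's law: all-of(Op) == !any-of(!Op). With an all-true governing
// predicate Pg, (Op XOR Pg) is the lane-wise NOT of Op, so a PTEST with
// NONE_ACTIVE produces the desired all-of result.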
26910
26911SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
26912 SDValue ScalarOp,
26913 SelectionDAG &DAG) const {
26914 SDLoc DL(ScalarOp);
26915 SDValue VecOp = ScalarOp.getOperand(0);
26916 EVT SrcVT = VecOp.getValueType();
26917
26919 SrcVT,
26920 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
26921 EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
26922 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
26923 }
26924
26925 // UADDV always returns an i64 result.
26926 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
26927 SrcVT.getVectorElementType();
26928 EVT RdxVT = SrcVT;
26929 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
26930 RdxVT = getPackedSVEVectorVT(ResVT);
26931
26932 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
26933 SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
26934 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
26935 Rdx, DAG.getConstant(0, DL, MVT::i64));
26936
26937 // The VEC_REDUCE nodes expect a result matching the element size.
26938 if (ResVT != ScalarOp.getValueType())
26939 Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
26940
26941 return Res;
26942}
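// Illustrative example (assumption): for vecreduce_add on a fixed-length
// v4i32, Opcode is AArch64ISD::UADDV_PRED, RdxVT becomes nxv2i64, and the
// i64 value extracted from lane 0 is truncated back to the i32 result type
// that the VECREDUCE node expects.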
26943
26944SDValue
26945AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
26946 SelectionDAG &DAG) const {
26947 EVT VT = Op.getValueType();
26948 SDLoc DL(Op);
26949
26950 EVT InVT = Op.getOperand(1).getValueType();
26951 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26952 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
26953 SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
26954
26955 // Convert the mask to a predicate (NOTE: We don't need to worry about
26956 // inactive lanes since VSELECT is safe when given undefined elements).
26957 EVT MaskVT = Op.getOperand(0).getValueType();
26958 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
26959 auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
26961 MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
26962
26963 auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
26964 Mask, Op1, Op2);
26965
26966 return convertFromScalableVector(DAG, VT, ScalableRes);
26967}
26968
26969SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
26970 SDValue Op, SelectionDAG &DAG) const {
26971 SDLoc DL(Op);
26972 EVT InVT = Op.getOperand(0).getValueType();
26973 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26974
26975 assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
26976 "Only expected to lower fixed length vector operation!");
26977 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
26978 "Expected integer result of the same bit length as the inputs!");
26979
26980 auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
26981 auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
26982 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
26983
26984 EVT CmpVT = Pg.getValueType();
26985 auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
26986 {Pg, Op1, Op2, Op.getOperand(2)});
26987
26988 EVT PromoteVT = ContainerVT.changeTypeToInteger();
26989 auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
26990 return convertFromScalableVector(DAG, Op.getValueType(), Promote);
26991}
26992
26993SDValue
26994AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
26995 SelectionDAG &DAG) const {
26996 SDLoc DL(Op);
26997 auto SrcOp = Op.getOperand(0);
26998 EVT VT = Op.getValueType();
26999 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27000 EVT ContainerSrcVT =
27001 getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
27002
27003 SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
27004 Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
27005 return convertFromScalableVector(DAG, VT, Op);
27006}
27007
27008SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
27009 SDValue Op, SelectionDAG &DAG) const {
27010 SDLoc DL(Op);
27011 unsigned NumOperands = Op->getNumOperands();
27012
27013 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
27014 "Unexpected number of operands in CONCAT_VECTORS");
27015
27016 auto SrcOp1 = Op.getOperand(0);
27017 auto SrcOp2 = Op.getOperand(1);
27018 EVT VT = Op.getValueType();
27019 EVT SrcVT = SrcOp1.getValueType();
27020
27021 if (NumOperands > 2) {
27023 EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
27024 for (unsigned I = 0; I < NumOperands; I += 2)
27025 Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
27026 Op->getOperand(I), Op->getOperand(I + 1)));
27027
27028 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
27029 }
27030
27031 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27032
27034 SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
27035 SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
27036
27037 Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
27038
27039 return convertFromScalableVector(DAG, VT, Op);
27040}
27041
27042SDValue
27043AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
27044 SelectionDAG &DAG) const {
27045 EVT VT = Op.getValueType();
27046 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27047
27048 SDLoc DL(Op);
27049 SDValue Val = Op.getOperand(0);
27050 SDValue Pg = getPredicateForVector(DAG, DL, VT);
27051 EVT SrcVT = Val.getValueType();
27052 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27053 EVT ExtendVT = ContainerVT.changeVectorElementType(
27054 SrcVT.getVectorElementType());
27055
27056 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
27057 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
27058
27059 Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
27060 Val = getSVESafeBitCast(ExtendVT, Val, DAG);
27061 Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
27062 Pg, Val, DAG.getUNDEF(ContainerVT));
27063
27064 return convertFromScalableVector(DAG, VT, Val);
27065}
27066
27067SDValue
27068AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
27069 SelectionDAG &DAG) const {
27070 EVT VT = Op.getValueType();
27071 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27072
27073 SDLoc DL(Op);
27074 SDValue Val = Op.getOperand(0);
27075 EVT SrcVT = Val.getValueType();
27076 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27077 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
27079 SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
27080
27081 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27082 Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
27083 Op.getOperand(1), DAG.getUNDEF(RoundVT));
27084 Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
27085 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
27086
27087 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
27088 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
27089}
27090
27091SDValue
27092AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
27093 SelectionDAG &DAG) const {
27094 EVT VT = Op.getValueType();
27095 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27096
27097 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
27098 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
27100
27101 SDLoc DL(Op);
27102 SDValue Val = Op.getOperand(0);
27103 EVT SrcVT = Val.getValueType();
27104 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27105 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27106
27107 if (VT.bitsGE(SrcVT)) {
27109
27110 Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
27111 VT.changeTypeToInteger(), Val);
27112
27113 // Safe to use a larger-than-specified operand type because by promoting the
27114 // value nothing has changed from an arithmetic point of view.
27115 Val =
27116 convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
27117 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
27118 DAG.getUNDEF(ContainerDstVT));
27119 return convertFromScalableVector(DAG, VT, Val);
27120 } else {
27121 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
27122 ContainerDstVT.getVectorElementType());
27124
27125 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27126 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
27127 Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
27128 Val = convertFromScalableVector(DAG, SrcVT, Val);
27129
27130 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
27131 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
27132 }
27133}
27134
27135SDValue
27136AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
27137 SelectionDAG &DAG) const {
27138 SDLoc DL(Op);
27139 EVT OpVT = Op.getValueType();
27140 assert(OpVT.isScalableVector() &&
27141 "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
27142 SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
27143 Op.getOperand(1));
27144 SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
27145 Op.getOperand(1));
27146 return DAG.getMergeValues({Even, Odd}, DL);
27147}
27148
27149SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
27150 SelectionDAG &DAG) const {
27151 SDLoc DL(Op);
27152 EVT OpVT = Op.getValueType();
27153 assert(OpVT.isScalableVector() &&
27154 "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
27155
27156 SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
27157 Op.getOperand(1));
27158 SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0),
27159 Op.getOperand(1));
27160 return DAG.getMergeValues({Lo, Hi}, DL);
27161}
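// Illustrative example (assumption): with 128-bit registers (nxv4i32,
// vscale == 1), interleaving A = <a0,a1,a2,a3> and B = <b0,b1,b2,b3> gives
//
//   Lo = ZIP1(A, B) = <a0,b0,a1,b1>
//   Hi = ZIP2(A, B) = <a2,b2,a3,b3>
//
// while the deinterleave lowering above uses UZP1/UZP2 to gather the even-
// and odd-numbered elements respectively.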
27162
27163SDValue
27164AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
27165 SelectionDAG &DAG) const {
27166 EVT VT = Op.getValueType();
27167 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27168
27169 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
27170 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
27172
27173 SDLoc DL(Op);
27174 SDValue Val = Op.getOperand(0);
27175 EVT SrcVT = Val.getValueType();
27176 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27177 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27178
27179 if (VT.bitsGT(SrcVT)) {
27180 EVT CvtVT = ContainerDstVT.changeVectorElementType(
27181 ContainerSrcVT.getVectorElementType());
27183
27184 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
27185 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
27186
27187 Val = convertToScalableVector(DAG, ContainerDstVT, Val);
27188 Val = getSVESafeBitCast(CvtVT, Val, DAG);
27189 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
27190 DAG.getUNDEF(ContainerDstVT));
27191 return convertFromScalableVector(DAG, VT, Val);
27192 } else {
27193 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
27195
27196 // Safe to use a larger-than-specified result type since an fp_to_int whose
27197 // result doesn't fit into the destination is undefined.
27198 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27199 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
27200 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
27201
27202 return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
27203 }
27204}
27205
27207 ArrayRef<int> ShuffleMask, EVT VT,
27208 EVT ContainerVT, SelectionDAG &DAG) {
27209 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
27210 SDLoc DL(Op);
27211 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
27212 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
27213 bool IsSingleOp =
27214 ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());
27215
27216 if (!Subtarget.isNeonAvailable() && !MinSVESize)
27217 MinSVESize = 128;
27218
27219 // Bail out on two-operand shuffles if SVE2 is unavailable or not all index
27220 // values can be represented.
27221 if (!IsSingleOp && !Subtarget.hasSVE2())
27222 return SDValue();
27223
27224 EVT VTOp1 = Op.getOperand(0).getValueType();
27225 unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
27226 unsigned IndexLen = MinSVESize / BitsPerElt;
27227 unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
27228 uint64_t MaxOffset = APInt(BitsPerElt, -1, false).getZExtValue();
27229 EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
27230 EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
27231 bool MinMaxEqual = (MinSVESize == MaxSVESize);
27232 assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
27233 "Incorrectly legalised shuffle operation");
27234
27236 // If MinSVESize is not equal to MaxSVESize then we need to know which
27237 // TBL mask element needs adjustment.
27238 SmallVector<SDValue, 8> AddRuntimeVLMask;
27239
27240 // Bail out for 8-bit element types, because with a 2048-bit SVE register
27241 // size, 8 bits are only sufficient to index into the first source vector.
27242 if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
27243 return SDValue();
27244
27245 for (int Index : ShuffleMask) {
27246 // Handle poison index values.
27247 if (Index < 0)
27248 Index = 0;
27249 // If the mask refers to elements in the second operand, then we have to
27250 // offset the index by the number of elements in a vector. If this number
27251 // is not known at compile-time, we need to maintain a mask with 'VL' values
27252 // to add at runtime.
27253 if ((unsigned)Index >= ElementsPerVectorReg) {
27254 if (MinMaxEqual) {
27255 Index += IndexLen - ElementsPerVectorReg;
27256 } else {
27257 Index = Index - ElementsPerVectorReg;
27258 AddRuntimeVLMask.push_back(DAG.getConstant(1, DL, MVT::i64));
27259 }
27260 } else if (!MinMaxEqual)
27261 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
27262 // For 8-bit elements with 1024-bit SVE registers, where MaxOffset equals
27263 // 255, the index might point to the last element in the second operand
27264 // of the shufflevector, so reject this transform.
27265 if ((unsigned)Index >= MaxOffset)
27266 return SDValue();
27267 TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
27268 }
27269
27270 // Choosing an out-of-range index zeroes the lane, rather than duplicating
27271 // the first lane as would otherwise happen for out-of-range elements, so
27272 // pad the remaining mask entries with MaxOffset. Note that for i8 elements
27273 // an out-of-range index could still be valid with a 2048-bit vector register.
27274 for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
27275 TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
27276 if (!MinMaxEqual)
27277 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
27278 }
27279
27280 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
27281 SDValue VecMask =
27282 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
27283 SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
27284
27285 SDValue Shuffle;
27286 if (IsSingleOp)
27287 Shuffle =
27288 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
27289 DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
27290 Op1, SVEMask);
27291 else if (Subtarget.hasSVE2()) {
27292 if (!MinMaxEqual) {
27293 unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
27294 SDValue VScale = (BitsPerElt == 64)
27295 ? DAG.getVScale(DL, MVT::i64, APInt(64, MinNumElts))
27296 : DAG.getVScale(DL, MVT::i32, APInt(32, MinNumElts));
27297 SDValue VecMask =
27298 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
27299 SDValue MulByMask = DAG.getNode(
27300 ISD::MUL, DL, MaskType,
27301 DAG.getNode(ISD::SPLAT_VECTOR, DL, MaskType, VScale),
27302 DAG.getBuildVector(MaskType, DL,
27303 ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
27304 SDValue UpdatedVecMask =
27305 DAG.getNode(ISD::ADD, DL, MaskType, VecMask, MulByMask);
27306 SVEMask = convertToScalableVector(
27307 DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask);
27308 }
27309 Shuffle =
27310 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
27311 DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
27312 Op1, Op2, SVEMask);
27313 }
27314 Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
27315 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
27316}
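// Illustrative example (assumption): with MinSVESize == MaxSVESize == 256 and
// v4i32 operands, IndexLen is 8 and ElementsPerVectorReg is 4, so a shuffle
// mask of <0,5,2,7> becomes the TBL2 mask
//
//   <0, 9, 2, 11, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF>
//
// where indices into the second operand are rebased onto the second source
// register and the trailing out-of-range entries zero the unused lanes.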
27317
27318SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
27319 SDValue Op, SelectionDAG &DAG) const {
27320 EVT VT = Op.getValueType();
27321 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27322
27323 auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
27324 auto ShuffleMask = SVN->getMask();
27325
27326 SDLoc DL(Op);
27327 SDValue Op1 = Op.getOperand(0);
27328 SDValue Op2 = Op.getOperand(1);
27329
27330 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27331 Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
27332 Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
27333
27334 auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
27335 if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
27336 return MVT::i32;
27337 return ScalarTy;
27338 };
27339
27340 if (SVN->isSplat()) {
27341 unsigned Lane = std::max(0, SVN->getSplatIndex());
27342 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
27343 SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
27344 DAG.getConstant(Lane, DL, MVT::i64));
27345 Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
27346 return convertFromScalableVector(DAG, VT, Op);
27347 }
27348
27349 bool ReverseEXT = false;
27350 unsigned Imm;
27351 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
27352 Imm == VT.getVectorNumElements() - 1) {
27353 if (ReverseEXT)
27354 std::swap(Op1, Op2);
27355 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
27356 SDValue Scalar = DAG.getNode(
27357 ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
27358 DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
27359 Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
27360 return convertFromScalableVector(DAG, VT, Op);
27361 }
27362
27363 for (unsigned LaneSize : {64U, 32U, 16U}) {
27364 if (isREVMask(ShuffleMask, VT, LaneSize)) {
27365 EVT NewVT =
27367 unsigned RevOp;
27368 unsigned EltSz = VT.getScalarSizeInBits();
27369 if (EltSz == 8)
27371 else if (EltSz == 16)
27373 else
27375
27376 Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
27377 Op = LowerToPredicatedOp(Op, DAG, RevOp);
27378 Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
27379 return convertFromScalableVector(DAG, VT, Op);
27380 }
27381 }
27382
27383 if (Subtarget->hasSVE2p1() && VT.getScalarSizeInBits() == 64 &&
27384 isREVMask(ShuffleMask, VT, 128)) {
27385 if (!VT.isFloatingPoint())
27386 return LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
27387
27389 Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
27390 Op = LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
27391 Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
27392 return convertFromScalableVector(DAG, VT, Op);
27393 }
27394
27395 unsigned WhichResult;
27396 if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
27398 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
27399
27400 if (isTRNMask(ShuffleMask, VT, WhichResult)) {
27401 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
27403 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
27404 }
27405
27406 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
27408 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
27409
27410 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
27411 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
27413 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
27414 }
27415
27416 // Functions like isZIPMask return true when an ISD::VECTOR_SHUFFLE's mask
27417 // represents the same logical operation as performed by a ZIP instruction. In
27418 // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
27419 // equivalent to an AArch64 instruction. There's the extra component of
27420 // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
27421 // only operated on 64/128-bit vector types that have a direct mapping to a
27422 // target register and so an exact mapping is implied.
27423 // However, when using SVE for fixed length vectors, most legal vector types
27424 // are actually sub-vectors of a larger SVE register. When mapping
27425 // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
27426 // how the mask's indices translate. Specifically, when the mapping requires
27427 // an exact meaning for a specific vector index (e.g. Index X is the last
27428 // vector element in the register) then such mappings are often only safe when
27429 // the exact SVE register size is known. The main exception to this is when
27430 // indices are logically relative to the first element of either
27431 // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
27432 // when converting from fixed-length to scalable vector types (i.e. the start
27433 // of a fixed length vector is always the start of a scalable vector).
27434 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
27435 unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
27436 if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
27437 if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
27438 Op2.isUndef()) {
27439 Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
27440 return convertFromScalableVector(DAG, VT, Op);
27441 }
27442
27443 if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
27445 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
27446
27447 if (isUZPMask(ShuffleMask, VT, WhichResult)) {
27448 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
27450 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
27451 }
27452
27453 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
27455 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
27456
27457 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
27458 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
27460 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
27461 }
27462 }
27463
27464 // Avoid producing a TBL instruction if we don't know the minimal SVE register
27465 // size, unless NEON is not available and we can assume the minimal SVE
27466 // register size is 128 bits.
27467 if (MinSVESize || !Subtarget->isNeonAvailable())
27468 return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
27469 DAG);
27470
27471 return SDValue();
27472}
27473
27474SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
27475 SelectionDAG &DAG) const {
27476 SDLoc DL(Op);
27477 EVT InVT = Op.getValueType();
27478
27479 assert(VT.isScalableVector() && isTypeLegal(VT) &&
27480 InVT.isScalableVector() && isTypeLegal(InVT) &&
27481 "Only expect to cast between legal scalable vector types!");
27482 assert(VT.getVectorElementType() != MVT::i1 &&
27483 InVT.getVectorElementType() != MVT::i1 &&
27484 "For predicate bitcasts, use getSVEPredicateBitCast");
27485
27486 if (InVT == VT)
27487 return Op;
27488
27490 EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
27491
27492 // Safe bitcasting between unpacked vector types of different element counts
27493 // is currently unsupported because the following is missing the necessary
27494 // work to ensure the result's elements live where they're supposed to within
27495 // an SVE register.
27496 // 01234567
27497 // e.g. nxv2i32 = XX??XX??
27498 // nxv4f16 = X?X?X?X?
27500 VT == PackedVT || InVT == PackedInVT) &&
27501 "Unexpected bitcast!");
27502
27503 // Pack input if required.
27504 if (InVT != PackedInVT)
27505 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
27506
27507 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
27508
27509 // Unpack result if required.
27510 if (VT != PackedVT)
27512
27513 return Op;
27514}
27515
27517 SDValue N) const {
27518 return ::isAllActivePredicate(DAG, N);
27519}
27520
27522 return ::getPromotedVTForPredicate(VT);
27523}
27524
27525bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
27526 SDValue Op, const APInt &OriginalDemandedBits,
27527 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
27528 unsigned Depth) const {
27529
27530 unsigned Opc = Op.getOpcode();
27531 switch (Opc) {
27532 case AArch64ISD::VSHL: {
27533 // Match (VSHL (VLSHR Val X) X)
27534 SDValue ShiftL = Op;
27535 SDValue ShiftR = Op->getOperand(0);
27536 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
27537 return false;
27538
27539 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
27540 return false;
27541
27542 unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
27543 unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
27544
27545 // Other cases can be handled as well, but this is not
27546 // implemented.
27547 if (ShiftRBits != ShiftLBits)
27548 return false;
27549
27550 unsigned ScalarSize = Op.getScalarValueSizeInBits();
27551 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
27552
27553 APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
27554 APInt UnusedBits = ~OriginalDemandedBits;
27555
27556 if ((ZeroBits & UnusedBits) != ZeroBits)
27557 return false;
27558
27559 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
27560 // used - simplify to just Val.
27561 return TLO.CombineTo(Op, ShiftR->getOperand(0));
27562 }
27564 if (auto ElementSize = IsSVECntIntrinsic(Op)) {
27565 unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
27566 if (!MaxSVEVectorSizeInBits)
27567 MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
27568 unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
27569 // The SVE count intrinsics don't support the multiplier immediate so we
27570 // don't have to account for that here. The value returned may be slightly
27571 // over the true required bits, as this is based on the "ALL" pattern. The
27572 // other patterns are also exposed by these intrinsics, but they all
27573 // return a value that's strictly less than "ALL".
27574 unsigned RequiredBits = llvm::bit_width(MaxElements);
27575 unsigned BitWidth = Known.Zero.getBitWidth();
27576 if (RequiredBits < BitWidth)
27577 Known.Zero.setHighBits(BitWidth - RequiredBits);
27578 return false;
27579 }
27580 }
27581 }
27582
27584 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
27585}
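// Illustrative example (assumption): for the VSHL case above, a pattern such
// as (VSHL (VLSHR X, 8), 8) only clears the low eight bits of X; if no user
// demands those bits (e.g. the result is masked with 0xFFFFFF00), the shift
// pair is replaced by X itself.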
27586
27587bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
27588 return Op.getOpcode() == AArch64ISD::DUP ||
27589 Op.getOpcode() == AArch64ISD::MOVI ||
27590 (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
27591 Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
27593}
27594
27596 return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
27597 Subtarget->hasComplxNum();
27598}
27599
27602 auto *VTy = dyn_cast<VectorType>(Ty);
27603 if (!VTy)
27604 return false;
27605
27606 // If the vector is scalable, SVE is enabled, implying support for complex
27607 // numbers. Otherwise, we need to ensure complex number support is available.
27608 if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
27609 return false;
27610
27611 auto *ScalarTy = VTy->getScalarType();
27612 unsigned NumElements = VTy->getElementCount().getKnownMinValue();
27613
27614 // We can only process vectors that have a bit size of 128 or higher (with an
27615 // additional 64 bits for Neon). Additionally, these vectors must have a
27616 // power-of-2 size, as we later split them into the smallest supported size
27617 // and merge them back together after applying the complex operation.
27618 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
27619 if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
27620 !llvm::isPowerOf2_32(VTyWidth))
27621 return false;
27622
27623 if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
27624 unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
27625 return 8 <= ScalarWidth && ScalarWidth <= 64;
27626 }
27627
27628 return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
27629 ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
27630}
27631
27634 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
27635 Value *Accumulator) const {
27636 VectorType *Ty = cast<VectorType>(InputA->getType());
27637 bool IsScalable = Ty->isScalableTy();
27638 bool IsInt = Ty->getElementType()->isIntegerTy();
27639
27640 unsigned TyWidth =
27641 Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
27642
27643 assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
27644 "Vector type must be either 64 or a power of 2 that is at least 128");
27645
27646 if (TyWidth > 128) {
27647 int Stride = Ty->getElementCount().getKnownMinValue() / 2;
27648 auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
27649 auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0));
27650 auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0));
27651 auto *UpperSplitA =
27652 B.CreateExtractVector(HalfTy, InputA, B.getInt64(Stride));
27653 auto *UpperSplitB =
27654 B.CreateExtractVector(HalfTy, InputB, B.getInt64(Stride));
27655 Value *LowerSplitAcc = nullptr;
27656 Value *UpperSplitAcc = nullptr;
27657 if (Accumulator) {
27658 LowerSplitAcc = B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(0));
27659 UpperSplitAcc =
27660 B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(Stride));
27661 }
27662 auto *LowerSplitInt = createComplexDeinterleavingIR(
27663 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
27664 auto *UpperSplitInt = createComplexDeinterleavingIR(
27665 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
27666
27667 auto *Result = B.CreateInsertVector(Ty, PoisonValue::get(Ty), LowerSplitInt,
27668 B.getInt64(0));
27669 return B.CreateInsertVector(Ty, Result, UpperSplitInt, B.getInt64(Stride));
27670 }
27671
27672 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
27673 if (Accumulator == nullptr)
27674 Accumulator = Constant::getNullValue(Ty);
27675
27676 if (IsScalable) {
27677 if (IsInt)
27678 return B.CreateIntrinsic(
27679 Intrinsic::aarch64_sve_cmla_x, Ty,
27680 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
27681
27682 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
27683 return B.CreateIntrinsic(
27684 Intrinsic::aarch64_sve_fcmla, Ty,
27685 {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
27686 }
27687
27688 Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
27689 Intrinsic::aarch64_neon_vcmla_rot90,
27690 Intrinsic::aarch64_neon_vcmla_rot180,
27691 Intrinsic::aarch64_neon_vcmla_rot270};
27692
27693
27694 return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
27695 {Accumulator, InputA, InputB});
27696 }
27697
27698 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
27699 if (IsScalable) {
27700 if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
27701 Rotation == ComplexDeinterleavingRotation::Rotation_270) {
27702 if (IsInt)
27703 return B.CreateIntrinsic(
27704 Intrinsic::aarch64_sve_cadd_x, Ty,
27705 {InputA, InputB, B.getInt32((int)Rotation * 90)});
27706
27707 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
27708 return B.CreateIntrinsic(
27709 Intrinsic::aarch64_sve_fcadd, Ty,
27710 {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
27711 }
27712 return nullptr;
27713 }
27714
27715 Intrinsic::ID IntId = Intrinsic::not_intrinsic;
27716 if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
27717 IntId = Intrinsic::aarch64_neon_vcadd_rot90;
27718 else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
27719 IntId = Intrinsic::aarch64_neon_vcadd_rot270;
27720
27721 if (IntId == Intrinsic::not_intrinsic)
27722 return nullptr;
27723
27724 return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
27725 }
27726
27727 return nullptr;
27728}
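// Illustrative walk-through of the splitting above (a sketch, not extra
// functionality): a CMulPartial on <vscale x 4 x double> has TyWidth 256, so
// each operand is split into two <vscale x 2 x double> halves (Stride = 2),
// each half is lowered to an aarch64_sve_fcmla call whose last operand is the
// rotation in degrees (0/90/180/270), and the halves are recombined with
// insertvector.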
27729
27730bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
27731 unsigned Opc = N->getOpcode();
27732 if (ISD::isExtOpcode(Opc)) {
27733 if (any_of(N->uses(),
27734 [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
27735 return false;
27736 }
27737 return true;
27738}
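// For example, with (mul (sext (splat X)), Y) the extend feeds a MUL, so the
// splat is kept in vector form and the extending multiply can later be
// matched as smull/umull instead of being scalarised.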
27739
27740unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
27741 return Subtarget->getMinimumJumpTableEntries();
27742}
27743
27744MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
27745 CallingConv::ID CC,
27746 EVT VT) const {
27747 bool NonUnitFixedLengthVector =
27748 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
27749 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
27750 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
27751
27752 EVT VT1;
27753 MVT RegisterVT;
27754 unsigned NumIntermediates;
27755 getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
27756 RegisterVT);
27757 return RegisterVT;
27758}
27759
27760unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
27761 LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
27762 bool NonUnitFixedLengthVector =
27763 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
27764 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
27765 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
27766
27767 EVT VT1;
27768 MVT VT2;
27769 unsigned NumIntermediates;
27770 return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
27771 NumIntermediates, VT2);
27772}
27773
27774unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
27775 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
27776 unsigned &NumIntermediates, MVT &RegisterVT) const {
27777 int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
27778 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
27779 if (!RegisterVT.isFixedLengthVector() ||
27780 RegisterVT.getFixedSizeInBits() <= 128)
27781 return NumRegs;
27782
27783 assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
27784 assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
27785 assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
27786
27787 // A size mismatch here implies either type promotion or widening and would
27788 // have resulted in scalarisation if larger vectors had not been available.
27789 if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
27790 EVT EltTy = VT.getVectorElementType();
27791 EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
27792 if (!isTypeLegal(NewVT))
27793 NewVT = EltTy;
27794
27795 IntermediateVT = NewVT;
27796 NumIntermediates = VT.getVectorNumElements();
27797 RegisterVT = getRegisterType(Context, NewVT);
27798 return NumIntermediates;
27799 }
27800
27801 // SVE VLS support does not introduce a new ABI so we should use NEON sized
27802 // types for vector arguments and returns.
27803
27804 unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
27805 NumIntermediates *= NumSubRegs;
27806 NumRegs *= NumSubRegs;
27807
27808 switch (RegisterVT.getVectorElementType().SimpleTy) {
27809 default:
27810 llvm_unreachable("unexpected element type for vector");
27811 case MVT::i8:
27812 IntermediateVT = RegisterVT = MVT::v16i8;
27813 break;
27814 case MVT::i16:
27815 IntermediateVT = RegisterVT = MVT::v8i16;
27816 break;
27817 case MVT::i32:
27818 IntermediateVT = RegisterVT = MVT::v4i32;
27819 break;
27820 case MVT::i64:
27821 IntermediateVT = RegisterVT = MVT::v2i64;
27822 break;
27823 case MVT::f16:
27824 IntermediateVT = RegisterVT = MVT::v8f16;
27825 break;
27826 case MVT::f32:
27827 IntermediateVT = RegisterVT = MVT::v4f32;
27828 break;
27829 case MVT::f64:
27830 IntermediateVT = RegisterVT = MVT::v2f64;
27831 break;
27832 case MVT::bf16:
27833 IntermediateVT = RegisterVT = MVT::v8bf16;
27834 break;
27835 }
27836
27837 return NumRegs;
27838}
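// Illustrative breakdown, assuming 512-bit SVE VLS codegen where v16i32 is a
// legal fixed-length type: the generic breakdown returns one 512-bit
// register, NumSubRegs is 512 / 128 = 4, and the argument is instead passed
// as four v4i32 (NEON-sized) registers, keeping the ABI identical to a
// non-SVE build.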
27839
27840bool AArch64TargetLowering::hasInlineStackProbe(
27841 const MachineFunction &MF) const {
27842 return !Subtarget->isTargetWindows() &&
27843 MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
27844}
Enable aggressive FMA fusion on targets that want it.
Value * getSDagStackGuard(const Module &M) const override
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
MachineBasicBlock * EmitDynamicProbedAlloc(MachineInstr &MI, MachineBasicBlock *MBB) const
SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, unsigned Condition, SDValue PStateSM=SDValue()) const
If a change in streaming mode is required on entry to/return from a function call, it emits and return...
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override
Return true if the @llvm.get.active.lane.mask intrinsic should be expanded using generic code in Sele...
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON=false) const
bool mergeStoresAfterLegalization(EVT VT) const override
SVE code generation for fixed length vectors does not custom lower BUILD_VECTOR.
Class for arbitrary precision integers.
Definition: APInt.h:76
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:212
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:427
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:981
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition: APInt.h:207
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1491
static void sdivrem(const APInt &LHS, const APInt &RHS, APInt &Quotient, APInt &Remainder)
Definition: APInt.cpp:1896
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1370
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1002
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:349
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1439
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:187
APInt sadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1934
bool sle(const APInt &RHS) const
Signed less or equal comparison.
Definition: APInt.h:1144
APInt uadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1941
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1589
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:197
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1010
unsigned logBase2() const
Definition: APInt.h:1703
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:805
bool isMask(unsigned numBits) const
Definition: APInt.h:466
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:312
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:954
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:418
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:284
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition: APInt.h:319
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:274
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1215
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:367
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1513
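The APInt helpers above tend to be combined when classifying immediates. A minimal standalone sketch (not taken from the lowering code; the helper names are invented) of two such checks:

#include "llvm/ADT/APInt.h"
using namespace llvm;

// Width of a contiguous low-bit mask, or 0 if Imm is not such a mask.
// isMask(N) is true exactly when the low N bits (and only those) are set.
static unsigned lowMaskWidth(uint64_t Imm) {
  APInt V(64, Imm);
  for (unsigned N = 1; N <= 64; ++N)
    if (V.isMask(N))
      return N;
  return 0;
}

// True if V survives a round trip through an N-bit signed value, i.e. it
// fits in N bits after sign extension.
static bool fitsInSignedBits(const APInt &V, unsigned N) {
  return V.sextOrTrunc(N).sextOrTrunc(V.getBitWidth()) == V;
}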
an instruction to allocate memory on the stack
Definition: Instructions.h:59
This class represents an incoming formal argument to a Function.
Definition: Argument.h:28
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:539
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:748
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:778
@ Or
*p = old | v
Definition: Instructions.h:772
@ And
*p = old & v
Definition: Instructions.h:768
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:776
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:782
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:780
@ Nand
*p = ~(old & v)
Definition: Instructions.h:770
bool isFloatingPointOperation() const
Definition: Instructions.h:922
BinOp getOperation() const
Definition: Instructions.h:845
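A hedged sketch of how a backend-style predicate might classify an atomicrmw using the enumerators above; the policy itself is invented for illustration:

#include "llvm/IR/Instructions.h"
using namespace llvm;

// Hypothetical policy: route FP, min/max and nand forms to a CAS loop and
// keep the remaining integer forms as native read-modify-write operations.
static bool wantsCmpXchgExpansion(const AtomicRMWInst &RMW) {
  if (RMW.isFloatingPointOperation())
    return true;
  switch (RMW.getOperation()) {
  case AtomicRMWInst::Min:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::Nand:
    return true;
  default:
    return false;
  }
}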
This is an SDNode representing atomic operations.
bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:93
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:205
A "pseudo-class" with methods for operating on BUILD_VECTORs.
ConstantFPSDNode * getConstantFPSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant FP or null if this is not a constant FP splat.
bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power of 2,...
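isConstantSplat is the usual entry point when a combine needs a uniform element value. A small sketch, assuming Op is an arbitrary SDValue (the helper name is made up):

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// True if Op is a BUILD_VECTOR whose defined lanes all splat one
// power-of-two integer constant.
static bool isPow2SplatBuildVector(SDValue Op) {
  auto *BV = dyn_cast<BuildVectorSDNode>(Op.getNode());
  if (!BV)
    return false;
  APInt SplatValue, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  return BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
                             HasAnyUndefs) &&
         SplatValue.isPowerOf2();
}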
CCState - This class holds information needed while lowering arguments and return values.
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
bool isMemLoc() const
int64_t getLocMemOffset() const
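CCState and CCValAssign are consumed with the same loop shape throughout argument and return lowering. A schematic of that pattern, assuming CallConv, IsVarArg, MF, Ins and DAG are in scope and CC_Hypothetical stands in for a real CCAssignFn:

SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
CCInfo.AnalyzeFormalArguments(Ins, CC_Hypothetical);

for (const CCValAssign &VA : ArgLocs) {
  if (VA.isRegLoc()) {
    // The value arrives in VA.getLocReg(); copy it out of the physreg.
  } else {
    assert(VA.isMemLoc() && "expected a stack location");
    // The value lives on the stack at offset VA.getLocMemOffset().
  }
}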
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1648
unsigned arg_size() const
Definition: InstrTypes.h:1646
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1832
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
bool isZero() const
Return true if the value is positive or negative zero.
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:204
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:144
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1398
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
bool isLittleEndian() const
Layout endianness...
Definition: DataLayout.h:238
bool isBigEndian() const
Definition: DataLayout.h:239
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:874
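A tiny standalone example of the two DataLayout queries above, assuming Ty is a fixed-size type:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
#include <utility>
using namespace llvm;

// Allocation size in bytes and preferred alignment for a fixed-size type.
static std::pair<uint64_t, Align> sizeAndAlign(const DataLayout &DL, Type *Ty) {
  return {DL.getTypeAllocSize(Ty).getFixedValue(), DL.getPrefTypeAlign(Ty)};
}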
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:299
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:296
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:307
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition: FastISel.h:66
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:168
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:135
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:680
bool empty() const
Definition: Function.h:804
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:200
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:677
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:262
Constant * getPersonalityFn() const
Get the personality function associated with this function.
Definition: Function.cpp:1874
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:338
arg_iterator arg_end()
Definition: Function.h:822
arg_iterator arg_begin()
Definition: Function.h:813
size_t size() const
Definition: Function.h:803
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:342
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:669
const GlobalValue * getGlobal() const
bool isThreadLocal() const
If the value is "Thread Local", its value isn't shared by the threads.
Definition: GlobalValue.h:263
bool hasExternalWeakLinkage() const
Definition: GlobalValue.h:528
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:655
Type * getValueType() const
Definition: GlobalValue.h:296
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:94
Value * CreateZExtOrBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2120
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2006
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition: IRBuilder.h:1031
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2455
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition: IRBuilder.h:1880
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2506
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1039
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:533
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2153
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcasted to NumElts elements.
Definition: IRBuilder.cpp:1214
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2499
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:460
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2056
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2105
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1431
ConstantInt * getInt8(uint8_t C)
Get a constant 8-bit value.
Definition: IRBuilder.h:470
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:174
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2070
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:485
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2110
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1410
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2010
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2477
LLVMContext & getContext() const
Definition: IRBuilder.h:176
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2100
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1491
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:563
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2395
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Definition: IRBuilder.h:1865
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:510
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2649
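The IRBuilder calls above compose naturally; a small hedged sketch (the names and the arithmetic are illustrative only), assuming Builder already has a valid insertion point and V is an i32 Value:

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Widen V to i64, splat it across four lanes, and OR the splat with a
// left-shifted copy of itself.
static Value *widenSplatAndCombine(IRBuilderBase &Builder, Value *V) {
  Value *Wide = Builder.CreateZExt(V, Builder.getInt64Ty(), "wide");
  Value *Splat = Builder.CreateVectorSplat(4, Wide, "splat");
  Value *One = Builder.CreateVectorSplat(4, Builder.getInt64(1));
  Value *Shifted = Builder.CreateShl(Splat, One, "shifted");
  return Builder.CreateOr(Splat, Shifted, "combined");
}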
This instruction inserts a single (scalar) element into a VectorType value.
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:80
const BasicBlock * getParent() const
Definition: Instruction.h:151
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:84
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:251
Class to represent integer types.
Definition: DerivedTypes.h:40
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:267
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:100
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
Definition: LowLevelType.h:203
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
Definition: Instructions.h:184
Value * getPointerOperand()
Definition: Instructions.h:280
Type * getPointerOperandType() const
Definition: Instructions.h:283
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
@ INVALID_SIMPLE_VALUE_TYPE
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:585
bool isScalableVT() const
Return true if the type is a scalable type.
static auto all_valuetypes()
SimpleValueType Iteration.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto scalable_vector_valuetypes()
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
static auto fp_fixedlen_vector_valuetypes()
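A short standalone helper showing how the MVT queries above compose; the transformation itself is illustrative:

#include "llvm/CodeGenTypes/MachineValueType.h"
using namespace llvm;

// Halve the element width of a fixed-length integer vector, e.g. v4i32 ->
// v4i16; any other type is returned unchanged.
static MVT halveElementBits(MVT VT) {
  if (!VT.isFixedLengthVector() || !VT.isInteger() ||
      VT.getScalarSizeInBits() < 16)
    return VT;
  MVT NarrowElt = MVT::getIntegerVT(VT.getScalarSizeInBits() / 2);
  return MVT::getVectorVT(NarrowElt, VT.getVectorNumElements());
}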
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor bloc...
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
SSPLayoutKind getObjectSSPLayout(int ObjectIdx) const
int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
@ SSPLK_None
Did not trigger a stack protector.
void setFrameAddressIsTaken(bool T)
int getStackProtectorIndex() const
Return the index for the stack protector object.
int CreateSpillStackObject(uint64_t Size, Align Alignment)
Create a new statically sized stack object that represents a spill slot, returning a nonnegative iden...
void setStackID(int ObjectIdx, uint8_t ID)
void setHasTailCall(bool V=true)
bool hasMustTailInVarArgFunc() const
Returns true if the function is variadic and contains a musttail call.
void setReturnAddressIsTaken(bool s)
void computeMaxCallFrameSize(const MachineFunction &MF)
Computes the maximum size of a callframe and the AdjustsStack property.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int CreateVariableSizedObject(Align Alignment, const AllocaInst *Alloca)
Notify the MachineFrameInfo object that a variable sized object has been created.
int getObjectIndexEnd() const
Return one past the maximum frame object index.
bool hasStackProtectorIndex() const
uint8_t getStackID(int ObjectIdx) const
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
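CreateFixedObject and CreateStackObject are the two ways frame slots appear during lowering: fixed objects describe incoming stack arguments at known SP offsets, stack objects are fresh locals or spills. A schematic, with MF assumed in scope and the sizes and offsets as placeholders:

MachineFrameInfo &MFI = MF.getFrameInfo();
// An 8-byte incoming argument at SP+0 that the callee never rewrites.
int IncomingFI = MFI.CreateFixedObject(/*Size=*/8, /*SPOffset=*/0,
                                       /*IsImmutable=*/true);
// A fresh 16-byte, 16-byte-aligned spill slot.
int SpillFI = MFI.CreateStackObject(/*Size=*/16, Align(16),
                                    /*isSpillSlot=*/true);
(void)IncomingFI;
(void)SpillFI;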
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
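The MachineBasicBlock and MachineInstrBuilder entries above are the building blocks of EmitInstrWithCustomInserter-style expansions. A heavily trimmed skeleton of the usual control-flow-diamond shape; the pseudo's operand layout and everything marked as a placeholder are assumptions, not this file's actual code:

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include <iterator>
using namespace llvm;

static MachineBasicBlock *emitSelectLikePseudo(MachineInstr &MI,
                                               MachineBasicBlock *MBB,
                                               const TargetInstrInfo *TII) {
  const BasicBlock *LLVMBB = MBB->getBasicBlock();
  MachineFunction *MF = MBB->getParent();
  MachineFunction::iterator It = ++MBB->getIterator();

  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVMBB);
  MachineBasicBlock *JoinBB = MF->CreateMachineBasicBlock(LLVMBB);
  MF->insert(It, TrueBB);
  MF->insert(It, JoinBB);

  // Everything after MI continues in JoinBB, which inherits MBB's successors.
  JoinBB->splice(JoinBB->begin(), MBB, std::next(MI.getIterator()),
                 MBB->end());
  JoinBB->transferSuccessorsAndUpdatePHIs(MBB);
  MBB->addSuccessor(TrueBB);
  MBB->addSuccessor(JoinBB);
  TrueBB->addSuccessor(JoinBB);

  // Placeholder operand layout: %dst = Pseudo %trueval, %falseval, ...
  Register DstReg = MI.getOperand(0).getReg();
  Register TrueReg = MI.getOperand(1).getReg();
  Register FalseReg = MI.getOperand(2).getReg();

  // A real inserter emits the target's conditional branch from MBB here;
  // the PHI below merges the two incoming values in JoinBB.
  BuildMI(*JoinBB, JoinBB->begin(), MI.getDebugLoc(),
          TII->get(TargetOpcode::PHI), DstReg)
      .addReg(TrueReg).addMBB(TrueBB)
      .addReg(FalseReg).addMBB(MBB);

  MI.eraseFromParent();
  return JoinBB;
}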
Representation of each machine instruction.
Definition: MachineInstr.h:69
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value,.
MachineOperand class - Representation of each machine instruction operand.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
size_type size() const
Definition: MapVector.h:60
This class is used to represent an MGATHER node.
const SDValue & getPassThru() const
ISD::LoadExtType getExtensionType() const
This is a base class used to represent MGATHER and MSCATTER nodes.
const SDValue & getIndex() const
const SDValue & getScale() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
ISD::MemIndexType getIndexType() const
How is Index applied to BasePtr when computing addresses.
This class is used to represent an MLOAD node.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
const SDValue & getMask() const
const SDValue & getPassThru() const
const SDValue & getOffset() const
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This class is used to represent an MSTORE node.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isVolatile() const
Align getOriginalAlign() const
Returns alignment and volatility of the memory access.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
AtomicOrdering getMergedOrdering() const
Return a single atomic ordering that is at least as strong as both the success and failure orderings ...
const SDValue & getChain() const
bool isNonTemporal() const
bool isAtomic() const
Return true if the memory operation ordering is Unordered or higher.
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
bool getRtLibUseGOT() const
Returns true if PLT should be avoided for RTLib calls.
Definition: Module.cpp:690
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:287
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
void dump() const
Definition: Pass.cpp:136
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1827
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
void dump() const
Dump this node, for debugging.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
iterator_range< use_iterator > uses()
size_t use_size() const
Return the number of uses of this node.
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Return true if this node represents an undefined value.
void setFlags(SDNodeFlags NewFlags)
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
void dump() const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
void setNode(SDNode *N)
set the SDNode
unsigned getOpcode() const
unsigned getNumOperands() const
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasStreamingInterface() const
bool hasStreamingCompatibleInterface() const
bool hasNonStreamingInterface() const
bool hasStreamingBody() const
bool hasZAState() const
Class to represent scalable SIMD vectors.
Definition: DerivedTypes.h:586
static ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:713
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:225
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:722
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:474
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm, bool ConstantFold=true)
Return a node that represents the runtime scaling 'MulImm * RuntimeVL'.
SDNode * isConstantIntBuildVectorOrConstantInt(SDValue N) const
Test whether the given value is a constant int or similar node.
SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:478
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
Definition: SelectionDAG.h:732
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:828
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:472
void addCallSiteInfo(const SDNode *Node, CallSiteInfoImpl &&CallInfo)
Set CallSiteInfo to be associated with Node.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
Definition: SelectionDAG.h:480
bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, bool isTargetGA=false, unsigned TargetFlags=0)
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
Definition: SelectionDAG.h:659
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getRegister(unsigned Reg, EVT VT)
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
SDValue getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT, EVT OpVT)
Convert Op, which must be of integer type, to the integer type VT, by using an extension appropriate ...
SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:473
SDValue getStepVector(const SDLoc &DL, EVT ResVT, APInt StepVal)
Returns a vector of type ResVT whose elements contain the linear sequence <0, Step,...
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:773
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:676
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:768
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:469
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:799
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:845
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
LLVMContext * getContext() const
Definition: SelectionDAG.h:485
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:739
SDValue getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand, SDValue Subreg)
A convenience function for creating TargetInstrInfo::INSERT_SUBREG nodes.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:554
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
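A hedged example of how a few of the node builders above combine inside a lowering helper; the clamp semantics and names are invented, only the API usage is the point:

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Build (select (setugt Op, Limit), Limit, Op), i.e. clamp Op to Limit.
static SDValue clampToLimit(SelectionDAG &DAG, const SDLoc &DL, SDValue Op,
                            uint64_t Limit) {
  EVT VT = Op.getValueType();
  SDValue LimitC = DAG.getConstant(Limit, DL, VT);
  EVT CCVT = DAG.getTargetLoweringInfo().getSetCCResultType(
      DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue Cmp = DAG.getSetCC(DL, CCVT, Op, LimitC, ISD::SETUGT);
  return DAG.getSelect(DL, VT, Cmp, LimitC, Op);
}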
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static bool isSingleSourceMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector.
static void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
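The static mask classifiers above work directly on integer masks, which makes them easy to exercise in isolation; a quick illustration with hand-written masks:

#include "llvm/IR/Instructions.h"
#include <cassert>
using namespace llvm;

static void maskClassifierExamples() {
  int Rev[4] = {3, 2, 1, 0};
  int Extract[2] = {2, 3};
  assert(ShuffleVectorInst::isReverseMask(Rev, /*NumSrcElts=*/4));
  assert(ShuffleVectorInst::isSingleSourceMask(Rev, /*NumSrcElts=*/4));
  int Index = -1;
  assert(ShuffleVectorInst::isExtractSubvectorMask(Extract, /*NumSrcElts=*/4,
                                                   Index) &&
         Index == 2);
  (void)Rev; (void)Extract; (void)Index; // silence -Wunused in NDEBUG builds
}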
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
static bool isSplatMask(const int *Mask, EVT VT)
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
void push_back(const T &Elt)
Definition: SmallVector.h:426
pointer data()
Return a pointer to the vector's buffer, even if empty().
Definition: SmallVector.h:299
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
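SmallVector plus SmallSet is the standard worklist-and-visited-set idiom these classes exist for; a generic sketch (the Successors callback is a placeholder):

#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

// Depth-first walk over a graph of NodeT*, visiting each node once.
template <typename NodeT, typename SuccFn>
static void walkOnce(NodeT *Root, SuccFn Successors) {
  SmallVector<NodeT *, 8> Worklist;
  SmallSet<NodeT *, 8> Seen;
  Worklist.push_back(Root);
  while (!Worklist.empty()) {
    NodeT *N = Worklist.pop_back_val();
    // insert() reports whether the element was actually new.
    if (!Seen.insert(N).second)
      continue;
    for (NodeT *Succ : Successors(N))
      Worklist.push_back(Succ);
  }
}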
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
An instruction for storing to memory.
Definition: Instructions.h:317
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition: StringRef.h:466
StringRef slice(size_t Start, size_t End) const
Return a reference to the substring from [Start, End).
Definition: StringRef.h:680
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
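StringRef and StringSwitch typically pair up when parsing register or constraint strings. A tiny, self-contained sketch (the suffix table is hypothetical):

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
using namespace llvm;

// Map a one-letter width suffix to a bit width; 0 means "unrecognized".
static unsigned widthForSuffix(StringRef Name) {
  return StringSwitch<unsigned>(Name)
      .Case("b", 8)
      .Case("h", 16)
      .Case("s", 32)
      .Case("d", 64)
      .Default(0);
}

// getAsInteger returns true on failure, so invert it for a success flag.
static bool parseIndex(StringRef S, unsigned &Idx) {
  return !S.getAsInteger(/*Radix=*/10, Idx);
}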
Class to represent struct types.
Definition: DerivedTypes.h:216
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:373
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
EVT getMemValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const
Returns the target-specific address of the unsafe stack pointer.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
virtual bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const
Check whether or not MI needs to be moved close to its uses.
void setMaximumJumpTableSize(unsigned)
Indicate the maximum number of entries in jump tables.
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether a type is legal for a target, and if not, what action should be used to...
void setMaxBytesForAlignment(unsigned MaxBytes)
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual Value * getSDagStackGuard(const Module &M) const
Return the variable that was previously inserted by insertSSPDeclarations, if any, otherwise return nul...
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
unsigned getMaximumJumpTableSize() const
Return upper limit for number of entries in a jump table.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
virtual Value * getIRStackGuard(IRBuilderBase &IRB) const
If the target has a standard location for the stack protector guard, returns the address of that loca...
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is a legal add immediate, that is, the target has an add instruction...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const
Return true if it is profitable to reduce a load to a smaller type.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal or custom on this target.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
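As a companion to the sketch above (same hypothetical constructor), extending loads and truncating stores can be declared legal per memory type; the MVT choices are illustrative, not the real AArch64 settings.
// Continuation of the hypothetical constructor: tell the legalizer which
// extending loads and truncating stores may be kept as single nodes.
for (MVT MemVT : {MVT::i8, MVT::i16}) {
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, MemVT, Legal);  // ldrb/ldrh-style
  setLoadExtAction(ISD::SEXTLOAD, MVT::i64, MemVT, Legal);  // ldrsb/ldrsh-style
  setLoadExtAction(ISD::EXTLOAD,  MVT::i64, MemVT, Legal);
  setTruncStoreAction(MVT::i64, MemVT, Legal);              // strb/strh-style
}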
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
virtual EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
MVT getFrameIndexTy(const DataLayout &DL) const
Return the type for frame index, which is determined by the alloca address space specified through th...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
virtual bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions. Ref: "Hacker's Delight" by Henry Warren 1...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
virtual bool useLoadStackGuardNode() const
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
virtual bool isTargetCanonicalConstantNode(SDValue Op) const
Returns true if the given Opc is considered a canonical constant for the target, which should not be ...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned TLSSize
Bit size of immediate TLS offsets (0 == use the default).
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TargetCostKind CostKind) const
Return the expected cost of materializing for the given integer immediate of the specified type.
@ TCC_Free
Expected to fold away in lowering.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
bool isOSMSVCRT() const
Is this a "Windows" OS targeting a "MSVCRT.dll" environment.
Definition: Triple.h:651
bool isWindowsMSVCEnvironment() const
Checks if the environment could be MSVC.
Definition: Triple.h:618
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:330
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
static Type * getHalfTy(LLVMContext &C)
static Type * getDoubleTy(LLVMContext &C)
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition: Type.h:252
static Type * getBFloatTy(LLVMContext &C)
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
static IntegerType * getInt1Ty(LLVMContext &C)
@ FloatTyID
32-bit floating point type
Definition: Type.h:58
@ DoubleTyID
64-bit floating point type
Definition: Type.h:59
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:302
static IntegerType * getInt16Ty(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
static IntegerType * getInt8Ty(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isScalableTy() const
Return true if this is a type whose size is a known multiple of vscale.
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
static Type * getFloatTy(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:137
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getContainedType(unsigned i) const
This method is used to implement the type iterator (defined at the end of the file).
Definition: Type.h:377
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1808
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
const Use & getOperandUse(unsigned i) const
Definition: User.h:182
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
This class is used to represent EVT's, which are used to parameterize some operations.
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
Definition: DerivedTypes.h:507
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
Definition: DerivedTypes.h:454
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:676
static VectorType * getTruncatedElementVectorType(VectorType *VTy)
Definition: DerivedTypes.h:472
Type * getElementType() const
Definition: DerivedTypes.h:436
This class represents zero extension of integer types.
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:187
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:239
self_iterator getIterator()
Definition: ilist_node.h:109
#define UINT64_MAX
Definition: DataTypes.h:77
#define INT64_MAX
Definition: DataTypes.h:71
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static CondCode getInvertedCondCode(CondCode Code)
static unsigned getNZCVToSatisfyCondCode(CondCode Code)
Given a condition code, return NZCV flags that would satisfy that condition.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
@ NVCAST
Natural vector cast.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
static uint8_t encodeAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType9(uint64_t Imm)
static bool isAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType5(uint64_t Imm)
static int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType10(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType9(uint64_t Imm)
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
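A minimal standalone sketch of the two logical-immediate helpers above. It assumes the file lives inside the AArch64 backend so the target-private header is reachable, and the sample immediate is just one repeating pattern that happens to be encodable.
#include "MCTargetDesc/AArch64AddressingModes.h"  // target-private header
#include <cstdint>
#include <cstdio>
int main() {
  // 0x00ff00ff00ff00ff repeats an 8-bit run of ones in every 16-bit element,
  // so it is representable as an AND/ORR/EOR immediate.
  uint64_t Imm = 0x00ff00ff00ff00ffULL;
  if (llvm::AArch64_AM::isLogicalImmediate(Imm, /*regSize=*/64)) {
    uint64_t Enc = llvm::AArch64_AM::encodeLogicalImmediate(Imm, 64);
    std::printf("N:immr:imms encoding = 0x%llx\n", (unsigned long long)Enc);
  }
  return 0;
}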
static bool isAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType5(uint64_t Imm)
static int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
static bool isAdvSIMDModImmType10(uint64_t Imm)
static int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType8(uint64_t Imm)
static bool isAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType11(uint64_t Imm)
static bool isAdvSIMDModImmType11(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType6(uint64_t Imm)
static bool isAdvSIMDModImmType8(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType6(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType1(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType1(uint64_t Imm)
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
ArrayRef< MCPhysReg > getFPRArgRegs()
int getSMEPseudoMap(uint16_t Opcode)
static constexpr unsigned SVEMaxBitsPerVector
const unsigned RoundingBitsPos
static constexpr unsigned SVEBitsPerBlock
ArrayRef< MCPhysReg > getGPRArgRegs()
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ ARM64EC_Thunk_Native
Calling convention used in the ARM64EC ABI to implement calls between ARM64 code and thunks.
Definition: CallingConv.h:265
@ AArch64_VectorCall
Used between AArch64 Advanced SIMD functions.
Definition: CallingConv.h:221
@ Swift
Calling convention for Swift.
Definition: CallingConv.h:69
@ AArch64_SVE_VectorCall
Used between AArch64 SVE functions.
Definition: CallingConv.h:224
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition: CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserves most registers.
Definition: CallingConv.h:63
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2
Preserve X2-X15, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:241
@ CXX_FAST_TLS
Used for access functions.
Definition: CallingConv.h:72
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0
Preserve X0-X13, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:238
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition: CallingConv.h:50
@ PreserveAll
Used for runtime calls that preserves (almost) all registers.
Definition: CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ Win64
The C convention as implemented on Windows/x86-64 and AArch64.
Definition: CallingConv.h:159
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition: CallingConv.h:87
@ GRAAL
Used by GraalVM. Two additional registers are reserved.
Definition: CallingConv.h:255
@ ARM64EC_Thunk_X64
Calling convention used in the ARM64EC ABI to implement calls between x64 code and thunks.
Definition: CallingConv.h:260
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:750
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:236
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1126
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1122
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition: ISDOpcodes.h:476
@ VECREDUCE_SEQ_FADD
Generic reduction nodes.
Definition: ISDOpcodes.h:1339
@ VECREDUCE_SMIN
Definition: ISDOpcodes.h:1370
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:250
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:559
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:714
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1155
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1241
@ STRICT_FCEIL
Definition: ISDOpcodes.h:426
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1031
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:783
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:483
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:199
@ RETURNADDR
Definition: ISDOpcodes.h:95
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ STRICT_FMINIMUM
Definition: ISDOpcodes.h:436
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:790
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:543
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
Definition: ISDOpcodes.h:1355
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ VECREDUCE_FMAXIMUM
FMINIMUM/FMAXIMUM nodes propagate NaNs and signed zeroes using the llvm.minimum and llvm....
Definition: ISDOpcodes.h:1359
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:688
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:255
@ VECREDUCE_SMAX
Definition: ISDOpcodes.h:1369
@ STRICT_FSETCCS
Definition: ISDOpcodes.h:477
@ STRICT_FLOG2
Definition: ISDOpcodes.h:421
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1267
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:903
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:229
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1268
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:939
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition: ISDOpcodes.h:411
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1400
@ GlobalTLSAddress
Definition: ISDOpcodes.h:79
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:885
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:774
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition: ISDOpcodes.h:662
@ STRICT_UINT_TO_FP
Definition: ISDOpcodes.h:450
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:620
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition: ISDOpcodes.h:101
@ VECREDUCE_FADD
These reductions have relaxed evaluation order semantics, and have a single vector operand.
Definition: ISDOpcodes.h:1352
@ WRITE_REGISTER
Definition: ISDOpcodes.h:119
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1221
@ VECREDUCE_FMIN
Definition: ISDOpcodes.h:1356
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:988
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition: ISDOpcodes.h:758
@ STRICT_LROUND
Definition: ISDOpcodes.h:431
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:930
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1077
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:327
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1052
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1056
@ VECTOR_INTERLEAVE
VECTOR_INTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and output vectors having the same...
Definition: ISDOpcodes.h:586
@ STEP_VECTOR
STEP_VECTOR(IMM) - Returns a scalable vector whose lanes are comprised of a linear sequence of unsign...
Definition: ISDOpcodes.h:646
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:349
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:727
@ STRICT_FPOWI
Definition: ISDOpcodes.h:413
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1237
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:211
@ VECREDUCE_UMAX
Definition: ISDOpcodes.h:1371
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition: ISDOpcodes.h:627
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1151
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:323
@ STRICT_FTRUNC
Definition: ISDOpcodes.h:430
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
Definition: ISDOpcodes.h:1364
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition: ISDOpcodes.h:880
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:651
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:705
@ ATOMIC_LOAD_CLR
Definition: ISDOpcodes.h:1266
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:600
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1265
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:573
@ STRICT_FMAXIMUM
Definition: ISDOpcodes.h:435
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition: ISDOpcodes.h:47
@ STRICT_FMAXNUM
Definition: ISDOpcodes.h:424
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition: ISDOpcodes.h:118
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:535
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:203
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:780
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1211
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:856
@ STRICT_FMINNUM
Definition: ISDOpcodes.h:425
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:742
@ VSCALE
VSCALE(IMM) - Returns the runtime scaling factor used to calculate the number of elements within a sc...
Definition: ISDOpcodes.h:1329
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1248
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:971
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
Definition: ISDOpcodes.h:1215
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:331
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1041
@ STRICT_LRINT
Definition: ISDOpcodes.h:433
@ ConstantPool
Definition: ISDOpcodes.h:82
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:798
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:674
@ VECTOR_REVERSE
VECTOR_REVERSE(VECTOR) - Returns a vector, of the same type as VECTOR, whose elements are shuffled us...
Definition: ISDOpcodes.h:591
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:888
@ STRICT_FROUND
Definition: ISDOpcodes.h:428
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:736
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:303
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition: ISDOpcodes.h:449
@ VECREDUCE_UMIN
Definition: ISDOpcodes.h:1372
@ STRICT_FFLOOR
Definition: ISDOpcodes.h:427
@ STRICT_FROUNDEVEN
Definition: ISDOpcodes.h:429
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:94
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1263
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:443
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:465
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:442
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:984
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1264
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:836
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1182
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition: ISDOpcodes.h:158
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:470
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:680
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1208
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:184
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition: ISDOpcodes.h:657
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition: ISDOpcodes.h:400
@ STRICT_FLOG10
Definition: ISDOpcodes.h:420
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:524
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ STRICT_LLRINT
Definition: ISDOpcodes.h:434
@ VECTOR_SPLICE
VECTOR_SPLICE(VEC1, VEC2, IMM) - Returns a subvector of the same type as VEC1/VEC2 from CONCAT_VECTOR...
Definition: ISDOpcodes.h:612
@ STRICT_FEXP2
Definition: ISDOpcodes.h:418
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1262
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:869
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition: ISDOpcodes.h:106
@ STRICT_LLROUND
Definition: ISDOpcodes.h:432
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition: ISDOpcodes.h:831
@ STRICT_FNEARBYINT
Definition: ISDOpcodes.h:423
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:855
@ VECREDUCE_FMINIMUM
Definition: ISDOpcodes.h:1360
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:786
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1146
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1070
@ BlockAddress
Definition: ISDOpcodes.h:84
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:763
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:493
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:340
@ AssertZext
Definition: ISDOpcodes.h:62
@ STRICT_FRINT
Definition: ISDOpcodes.h:422
@ VECTOR_DEINTERLEAVE
VECTOR_DEINTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and output vectors having the sa...
Definition: ISDOpcodes.h:580
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:313
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:515
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isExtOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1594
bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
bool isVectorShrinkable(const SDNode *N, unsigned NewEltSize, bool Signed)
Returns true if the specified node is a vector where all elements can be truncated to the specified e...
CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
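A small standalone sketch of the two condition-code helpers above, assuming they are declared in llvm/CodeGen/ISDOpcodes.h alongside the ISD::CondCode enum listed in this index.
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include <cassert>
int main() {
  using namespace llvm;
  // !(a < b) on integers becomes a >= b.
  assert(ISD::getSetCCInverse(ISD::SETLT, MVT::i64) == ISD::SETGE);
  // (a < b) with the operands swapped becomes (b > a).
  assert(ISD::getSetCCSwappedOperands(ISD::SETLT) == ISD::SETGT);
  return 0;
}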
MemIndexType
MemIndexType enum - This enum defines how to interpret MGATHER/SCATTER's index parameter when calcula...
Definition: ISDOpcodes.h:1485
bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:1472
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1523
bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1503
static const int LAST_INDEXED_MODE
Definition: ISDOpcodes.h:1474
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1451
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:765
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:821
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:163
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
VScaleVal_match m_VScale()
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
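The m_* matchers above compose into declarative checks over IR values. A minimal sketch, where isWideningMul is a made-up helper name, not an LLVM API.
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
// Returns true if V has the shape mul(zext(X), zext(Y)); this is the kind of
// IR pattern the ext/mul combines look for. The helper name is illustrative.
static bool isWideningMul(llvm::Value *V) {
  using namespace llvm::PatternMatch;
  llvm::Value *X, *Y;
  return match(V, m_Mul(m_ZExt(m_Value(X)), m_ZExt(m_Value(Y))));
}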
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
@ Define
Register definition.
@ GeneralDynamic
Definition: CodeGen.h:46
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
std::optional< Function * > getAttachedARCFunction(const CallBase *CB)
This function returns operand bundle clang_arc_attachedcall's argument, which is the address of the A...
Definition: ObjCARCUtil.h:43
bool hasAttachedCallOpBundle(const CallBase *CB)
Definition: ObjCARCUtil.h:29
DiagnosticInfoOptimizationBase::Argument NV
@ FalseVal
Definition: TGLexer.h:59
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
bool isPackedVectorType(EVT SomeVT)
Definition: VECustomDAG.cpp:22
bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
@ Offset
Definition: DWP.cpp:456
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:862
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1731
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void GetReturnInfo(CallingConv::ID CC, Type *ReturnType, AttributeList attr, SmallVectorImpl< ISD::OutputArg > &Outs, const TargetLowering &TLI, const DataLayout &DL)
Given an LLVM IR type and return type attributes, compute the return value EVTs and flags,...
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition: Utils.cpp:293
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:228
bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
bool CC_AArch64_Win64PCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition: MathExtras.h:332
bool RetCC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
bool isIntOrFPConstant(SDValue V)
Return true if V is either a integer or FP constant.
bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
std::optional< unsigned > getSVEPredPatternFromNumElements(unsigned MinNumElts)
Return specific VL predicate pattern based on the number of elements.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:269
bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition: Utils.cpp:1438
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:319
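A standalone sketch of the power-of-two and mask predicates above, the arithmetic these lowerings rely on when turning multiplies into shifts or picking bitfield instructions; the constants are arbitrary examples.
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
int main() {
  // x * 64 can be rewritten as x << 6 because 64 is a power of two.
  uint64_t C = 64;
  assert(llvm::isPowerOf2_64(C));
  assert(llvm::Log2_64(C) == 6);
  // Contiguous-ones classification used when forming bitfield operations.
  assert(llvm::isMask_64(0x00ffULL));         // ones starting at bit 0
  assert(llvm::isShiftedMask_64(0x0ff0ULL));  // ones shifted away from bit 0
  return 0;
}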
bool CC_AArch64_Arm64EC_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:258
unsigned M1(unsigned Val)
Definition: VE.h:376
bool isReleaseOrStronger(AtomicOrdering AO)
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1738
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:313
bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:264
bool CC_AArch64_Arm64EC_Thunk_Native(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool CC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition: MathExtras.h:246
bool RetCC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
constexpr int PoisonMaskElem
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Other
Any other memory.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
CombineLevel
Definition: DAGCombine.h:15
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:233
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition: VE.h:375
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1758
gep_type_iterator gep_type_begin(const User *GEP)
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:2060
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1888
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition: STLExtras.h:2048
static const MachineMemOperand::Flags MOStridedAccess
bool CC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
@ Enable
Enable colors.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
Helper structure to keep track of a SET_CC lowered into AArch64 code.
AArch64CC::CondCode CC
Helper structure to keep track of ISD::SET_CC operands.
This is used by foldLoadsRecursive() to capture a Root Load node which is of type or(load,...
Helper structure to be able to read SetCC information.
static unsigned int semanticsPrecision(const fltSemantics &)
Definition: APFloat.cpp:292
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Represent subnormal handling kind for floating point instruction inputs and outputs.
Extended Value Type.
Definition: ValueTypes.h:34
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition: ValueTypes.h:93
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
uint64_t getScalarStoreSize() const
Definition: ValueTypes.h:387
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:274
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:290
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:340
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:448
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition: ValueTypes.h:112
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition: ValueTypes.h:349
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition: ValueTypes.h:415
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:455
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:203
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
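A standalone sketch of constructing and querying EVTs with the helpers listed here; the concrete types chosen (4 x i32, 4 x f32) are arbitrary examples.
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>
int main() {
  using namespace llvm;
  LLVMContext Ctx;
  // A fixed-width 4 x i32 vector and its scalable counterpart.
  EVT V4I32 = EVT::getVectorVT(Ctx, MVT::i32, 4);
  EVT NxV4I32 = EVT::getVectorVT(Ctx, MVT::i32, 4, /*IsScalable=*/true);
  assert(V4I32.getFixedSizeInBits() == 128);
  assert(NxV4I32.isScalableVector());
  // Switching to an integer element type of the same width, a common step
  // before bitcasting a floating-point vector into the integer domain.
  EVT V4F32 = EVT::getVectorVT(Ctx, MVT::f32, 4);
  assert(V4F32.changeVectorElementTypeToInteger() == V4I32);
  return 0;
}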
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:366
EVT widenIntegerVectorElementType(LLVMContext &Context) const
Return a VT for an integer vector type with the size of the elements doubled.
Definition: ValueTypes.h:429
bool isScalableVT() const
Return true if the type is a scalable type.
Definition: ValueTypes.h:183
bool isFixedLengthVector() const
Definition: ValueTypes.h:177
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition: ValueTypes.h:58
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool bitsGE(EVT VT) const
Return true if this has no fewer bits than VT.
Definition: ValueTypes.h:282
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition: ValueTypes.h:208
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:246
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:202
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition: ValueTypes.h:173
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:101
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:438
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:151
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition: ValueTypes.h:198
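
The EVT entries above document the extended-value-type queries and constructors this file leans on when legalizing types. As an illustrative aside, and not code taken from this file, the following minimal sketch shows how a few of them compose; the helper name halveAndIntegerize and the example types are hypothetical.

#include <cassert>
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"

using namespace llvm;

// Hypothetical helper: given a fixed-length vector VT, produce a vector with
// integer lanes of the same width but only half as many elements.
static EVT halveAndIntegerize(LLVMContext &Ctx, EVT VT) {
  assert(VT.isFixedLengthVector() && "expected a fixed-length vector type");
  // v4f32 -> v4i32: keep the lane count, switch the lanes to integers.
  EVT IntVT = VT.changeVectorElementTypeToInteger();
  // v4i32 -> v2i32: keep the lane type, halve the lane count.
  return IntVT.getHalfNumVectorElementsVT(Ctx);
}

For MVT::v4f32 this yields v2i32, whose getStoreSize() is 8 bytes and whose getScalarSizeInBits() is 32.
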
Describes a register that needs to be forwarded from the prologue to a musttail call.
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:297
static KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
Definition: KnownBits.cpp:422
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:157
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:40
static KnownBits lshr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for lshr(LHS, RHS).
Definition: KnownBits.cpp:364
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:292
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition: KnownBits.h:307
static KnownBits shl(const KnownBits &LHS, const KnownBits &RHS, bool NUW=false, bool NSW=false, bool ShAmtNonZero=false)
Compute known bits for shl(LHS, RHS).
Definition: KnownBits.cpp:279
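
The KnownBits helpers listed here are the arithmetic behind known-bits reasoning about target nodes. A small self-contained sketch, not taken from this file, of how the listed operations behave:

#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"

using namespace llvm;

// Illustrative only: shift a fully known constant and merge two candidates.
static void knownBitsSketch() {
  // 0xff << 8 is still fully known (0xff00), so at most 16 bits are needed to
  // represent any value consistent with the result.
  KnownBits C = KnownBits::makeConstant(APInt(32, 0xff));
  KnownBits S = KnownBits::makeConstant(APInt(32, 8));
  KnownBits Shifted = KnownBits::shl(C, S);
  unsigned MaxBits = Shifted.countMaxActiveBits(); // 16
  (void)MaxBits;

  // Merging two possible values (e.g. the arms of a select) keeps only the
  // bits on which both agree: bit 4 stays known-one, bit 5 becomes unknown.
  KnownBits A = KnownBits::makeConstant(APInt(32, 0x10));
  KnownBits B = KnownBits::makeConstant(APInt(32, 0x30));
  KnownBits Merged = A.intersectWith(B);
  (void)Merged;
}
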
Structure used to represent a pair of the argument number after call lowering and the register used to transfer...
This class contains a discriminated union of information about pointers in memory operands,...
unsigned getAddrSpace() const
Return the LLVM IR address space number that this pointer points into.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
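
The MachinePointerInfo factories above describe what a lowered load or store accesses, which later passes use for aliasing and scheduling decisions. A minimal sketch, not code from this file; the helper name and its arguments are hypothetical:

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/SelectionDAG.h"

using namespace llvm;

// Hypothetical helper: store Val to the stack slot for frame index FI,
// tagging the access with a fixed-stack MachinePointerInfo so the exact slot
// being written is known. FIPtr is assumed to already point Offset bytes into
// the slot (e.g. a frame-index node plus Offset).
static SDValue storeToFrameIndex(SelectionDAG &DAG, const SDLoc &DL,
                                 SDValue Chain, SDValue Val, SDValue FIPtr,
                                 int FI, int64_t Offset) {
  MachinePointerInfo PtrInfo =
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)
          .getWithOffset(Offset);
  return DAG.getStore(Chain, DL, Val, FIPtr, PtrInfo);
}
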
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
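
These alignment types appear throughout lowering: Align always holds a valid power-of-two value, while MaybeAlign can additionally be unspecified. A tiny illustrative sketch, not from this file:

#include <cstdint>
#include "llvm/Support/Alignment.h"

using namespace llvm;

// Align must hold a non-zero power of two; MaybeAlign may also be empty.
static uint64_t alignmentSketch() {
  Align A(16);  // A.value() == 16
  MaybeAlign M; // unspecified alignment
  // valueOrOne() lets callers treat "unspecified" as the weakest alignment.
  return A.value() + M.valueOrOne().value(); // 16 + 1
}
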
Constraint for a predicate of the form "cmp Pred Op, OtherOp", where Op is the value the constraint a...
Definition: PredicateInfo.h:74
These are IR-level optimization flags that may be propagated to SDNodes.
This represents a list of ValueTypes that has been interned by a SelectionDAG.
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
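
SmallMapVector pairs map lookup with deterministic, insertion-ordered iteration, plus inline storage for the first few entries. A short illustrative sketch, not taken from this file; the key/value choice is arbitrary:

#include "llvm/ADT/MapVector.h"

using namespace llvm;

// Iteration visits keys in insertion order (unlike a plain DenseMap); the "4"
// is how many entries are stored before any heap allocation.
static unsigned sumInInsertionOrder() {
  SmallMapVector<unsigned, unsigned, 4> Counts;
  Counts[2] += 1; // operator[] default-constructs missing values
  Counts[0] += 2;
  Counts[2] += 3;
  unsigned Sum = 0;
  for (const auto &KV : Counts) // visits key 2 first, then key 0
    Sum += KV.second;
  return Sum; // 6
}
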
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg If BaseGV is null...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::OutputArg, 32 > Outs
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
bool CombineTo(SDValue O, SDValue N)
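
Both CombineTo forms replace an existing value with a new one and let the DAG combiner's worklist pick up the change. A hypothetical sketch of the DAGCombinerInfo form, not a combine performed in this file:

#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetLowering.h"

using namespace llvm;

// Hypothetical combine: a bitcast whose destination type equals its source
// type is a no-op, so replace the node's result with its operand.
static SDValue foldTrivialBitcast(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI) {
  if (N->getOpcode() == ISD::BITCAST &&
      N->getValueType(0) == N->getOperand(0).getValueType()) {
    SDValue Replacement[] = {N->getOperand(0)};
    return DCI.CombineTo(N, Replacement, /*AddTo=*/true);
  }
  return SDValue();
}
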
Helper structure to keep track of SetCC information.
GenericSetCCInfo Generic
AArch64SetCCInfo AArch64