AArch64ISelLowering.cpp
1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
22#include "llvm/ADT/APFloat.h"
23#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/ArrayRef.h"
25#include "llvm/ADT/STLExtras.h"
26#include "llvm/ADT/SmallSet.h"
28#include "llvm/ADT/Statistic.h"
29#include "llvm/ADT/StringRef.h"
30#include "llvm/ADT/Twine.h"
59#include "llvm/IR/Attributes.h"
60#include "llvm/IR/Constants.h"
61#include "llvm/IR/DataLayout.h"
62#include "llvm/IR/DebugLoc.h"
64#include "llvm/IR/Function.h"
66#include "llvm/IR/GlobalValue.h"
67#include "llvm/IR/IRBuilder.h"
68#include "llvm/IR/Instruction.h"
71#include "llvm/IR/Intrinsics.h"
72#include "llvm/IR/IntrinsicsAArch64.h"
73#include "llvm/IR/Module.h"
75#include "llvm/IR/Type.h"
76#include "llvm/IR/Use.h"
77#include "llvm/IR/Value.h"
83#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <bitset>
94#include <cassert>
95#include <cctype>
96#include <cstdint>
97#include <cstdlib>
98#include <iterator>
99#include <limits>
100#include <optional>
101#include <tuple>
102#include <utility>
103#include <vector>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107
108#define DEBUG_TYPE "aarch64-lower"
109
110STATISTIC(NumTailCalls, "Number of tail calls");
111STATISTIC(NumShiftInserts, "Number of vector shift inserts");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130// in the future when both implementations are based on MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
142// XOR, OR and CMP all use ALU ports, and the data dependency becomes the
143// bottleneck after this transform on high-end CPUs. So this maximum leaf-node
144// limit guards that the cmp+ccmp transform stays profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148/// Value type used for condition codes.
149static const MVT MVT_CC = MVT::i32;
150
151static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
152 AArch64::X3, AArch64::X4, AArch64::X5,
153 AArch64::X6, AArch64::X7};
154static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
155 AArch64::Q3, AArch64::Q4, AArch64::Q5,
156 AArch64::Q6, AArch64::Q7};
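// X0-X7 and Q0-Q7 are the eight integer and FP/SIMD registers that AAPCS64
// uses for passing the first arguments of each class.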
157
159
161
162static inline EVT getPackedSVEVectorVT(EVT VT) {
163 switch (VT.getSimpleVT().SimpleTy) {
164 default:
165 llvm_unreachable("unexpected element type for vector");
166 case MVT::i8:
167 return MVT::nxv16i8;
168 case MVT::i16:
169 return MVT::nxv8i16;
170 case MVT::i32:
171 return MVT::nxv4i32;
172 case MVT::i64:
173 return MVT::nxv2i64;
174 case MVT::f16:
175 return MVT::nxv8f16;
176 case MVT::f32:
177 return MVT::nxv4f32;
178 case MVT::f64:
179 return MVT::nxv2f64;
180 case MVT::bf16:
181 return MVT::nxv8bf16;
182 }
183}
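// For example, an i16 element type maps to nxv8i16 and an f64 element type to
// nxv2f64: each packed type exactly fills one 128-bit SVE vector granule.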
184
185// NOTE: Currently there's only a need to return integer vector types. If this
186// changes then just add an extra "type" parameter.
188 switch (EC.getKnownMinValue()) {
189 default:
190 llvm_unreachable("unexpected element count for vector");
191 case 16:
192 return MVT::nxv16i8;
193 case 8:
194 return MVT::nxv8i16;
195 case 4:
196 return MVT::nxv4i32;
197 case 2:
198 return MVT::nxv2i64;
199 }
200}
201
203 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
204 "Expected scalable predicate vector type!");
205 switch (VT.getVectorMinNumElements()) {
206 default:
207 llvm_unreachable("unexpected element count for vector");
208 case 2:
209 return MVT::nxv2i64;
210 case 4:
211 return MVT::nxv4i32;
212 case 8:
213 return MVT::nxv8i16;
214 case 16:
215 return MVT::nxv16i8;
216 }
217}
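// For example, an nxv4i1 predicate is promoted to nxv4i32: the lane count is
// kept and each i1 lane is widened to the corresponding packed element size.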
218
219/// Returns true if VT's elements occupy the lowest bit positions of its
220/// associated register class without any intervening space.
221///
222/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
223/// same register class, but only nxv8f16 can be treated as a packed vector.
224static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
226 "Expected legal vector type!");
227 return VT.isFixedLengthVector() ||
229}
230
231// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
232// predicate and end with a passthru value matching the result type.
233static bool isMergePassthruOpcode(unsigned Opc) {
234 switch (Opc) {
235 default:
236 return false;
266 return true;
267 }
268}
269
270// Returns true if inactive lanes are known to be zeroed by construction.
272 switch (Op.getOpcode()) {
273 default:
274 return false;
275 // We guarantee i1 splat_vectors to zero the other lanes
279 return true;
281 switch (Op.getConstantOperandVal(0)) {
282 default:
283 return false;
284 case Intrinsic::aarch64_sve_ptrue:
285 case Intrinsic::aarch64_sve_pnext:
286 case Intrinsic::aarch64_sve_cmpeq:
287 case Intrinsic::aarch64_sve_cmpne:
288 case Intrinsic::aarch64_sve_cmpge:
289 case Intrinsic::aarch64_sve_cmpgt:
290 case Intrinsic::aarch64_sve_cmphs:
291 case Intrinsic::aarch64_sve_cmphi:
292 case Intrinsic::aarch64_sve_cmpeq_wide:
293 case Intrinsic::aarch64_sve_cmpne_wide:
294 case Intrinsic::aarch64_sve_cmpge_wide:
295 case Intrinsic::aarch64_sve_cmpgt_wide:
296 case Intrinsic::aarch64_sve_cmplt_wide:
297 case Intrinsic::aarch64_sve_cmple_wide:
298 case Intrinsic::aarch64_sve_cmphs_wide:
299 case Intrinsic::aarch64_sve_cmphi_wide:
300 case Intrinsic::aarch64_sve_cmplo_wide:
301 case Intrinsic::aarch64_sve_cmpls_wide:
302 case Intrinsic::aarch64_sve_fcmpeq:
303 case Intrinsic::aarch64_sve_fcmpne:
304 case Intrinsic::aarch64_sve_fcmpge:
305 case Intrinsic::aarch64_sve_fcmpgt:
306 case Intrinsic::aarch64_sve_fcmpuo:
307 case Intrinsic::aarch64_sve_facgt:
308 case Intrinsic::aarch64_sve_facge:
309 case Intrinsic::aarch64_sve_whilege:
310 case Intrinsic::aarch64_sve_whilegt:
311 case Intrinsic::aarch64_sve_whilehi:
312 case Intrinsic::aarch64_sve_whilehs:
313 case Intrinsic::aarch64_sve_whilele:
314 case Intrinsic::aarch64_sve_whilelo:
315 case Intrinsic::aarch64_sve_whilels:
316 case Intrinsic::aarch64_sve_whilelt:
317 case Intrinsic::aarch64_sve_match:
318 case Intrinsic::aarch64_sve_nmatch:
319 case Intrinsic::aarch64_sve_whilege_x2:
320 case Intrinsic::aarch64_sve_whilegt_x2:
321 case Intrinsic::aarch64_sve_whilehi_x2:
322 case Intrinsic::aarch64_sve_whilehs_x2:
323 case Intrinsic::aarch64_sve_whilele_x2:
324 case Intrinsic::aarch64_sve_whilelo_x2:
325 case Intrinsic::aarch64_sve_whilels_x2:
326 case Intrinsic::aarch64_sve_whilelt_x2:
327 return true;
328 }
329 }
330}
331
333 const AArch64Subtarget &STI)
334 : TargetLowering(TM), Subtarget(&STI) {
335 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
336 // we have to make something up. Arbitrarily, choose ZeroOrOne.
338 // When comparing vectors the result sets the different elements in the
339 // vector to all-one or all-zero.
341
342 // Set up the register classes.
343 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
344 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
345
346 if (Subtarget->hasLS64()) {
347 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
348 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
350 }
351
352 if (Subtarget->hasFPARMv8()) {
353 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
354 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
355 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
356 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
357 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
358 }
359
360 if (Subtarget->hasNEON()) {
361 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
362 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
363 // Someone set us up the NEON.
364 addDRTypeForNEON(MVT::v2f32);
365 addDRTypeForNEON(MVT::v8i8);
366 addDRTypeForNEON(MVT::v4i16);
367 addDRTypeForNEON(MVT::v2i32);
368 addDRTypeForNEON(MVT::v1i64);
369 addDRTypeForNEON(MVT::v1f64);
370 addDRTypeForNEON(MVT::v4f16);
371 addDRTypeForNEON(MVT::v4bf16);
372
373 addQRTypeForNEON(MVT::v4f32);
374 addQRTypeForNEON(MVT::v2f64);
375 addQRTypeForNEON(MVT::v16i8);
376 addQRTypeForNEON(MVT::v8i16);
377 addQRTypeForNEON(MVT::v4i32);
378 addQRTypeForNEON(MVT::v2i64);
379 addQRTypeForNEON(MVT::v8f16);
380 addQRTypeForNEON(MVT::v8bf16);
381 }
382
383 if (Subtarget->hasSVEorSME()) {
384 // Add legal sve predicate types
385 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
386 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
387 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
388 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
389 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
390
391 // Add legal sve data types
392 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
393 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
394 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
395 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
396
397 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
398 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
399 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
400 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
401 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
402 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
403
404 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
405 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
406 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
407
408 if (Subtarget->useSVEForFixedLengthVectors()) {
411 addRegisterClass(VT, &AArch64::ZPRRegClass);
412
415 addRegisterClass(VT, &AArch64::ZPRRegClass);
416 }
417 }
418
419 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
420 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
421 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
422 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
423
424 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
425 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
426 }
427
428 // Compute derived properties from the register classes
430
431 // Provide all sorts of operation actions
470
474
478
480
481 // Custom lowering hooks are needed for XOR
482 // to fold it into CSINC/CSINV.
485
486 // Virtually no operation on f128 is legal, but LLVM can't expand them when
487 // there's a valid register class, so we need custom operations in most cases.
511 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
512 // aren't handled.
513
514 // Lowering for many of the conversions is actually specified by the non-f128
515 // type. The LowerXXX function will be trivial when f128 isn't involved.
540 if (Subtarget->hasFPARMv8()) {
543 }
546 if (Subtarget->hasFPARMv8()) {
549 }
552
557
558 // Variable arguments.
563
564 // Variable-sized objects.
567
568 // Lowering Funnel Shifts to EXTR
573
575
576 // Constant pool entries
578
579 // BlockAddress
581
582 // AArch64 lacks both left-rotate and popcount instructions.
588 }
589
590 // AArch64 doesn't have i32 MULH{S|U}.
593
594 // AArch64 doesn't have {U|S}MUL_LOHI.
599
600 if (Subtarget->hasCSSC()) {
604
606
610
613
618
623 } else {
627
630
633 }
634
640 }
647
648 // Custom lower Add/Sub/Mul with overflow.
661
670
679 if (Subtarget->hasFullFP16()) {
682 } else {
685 }
686
687 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
695 setOperationAction(Op, MVT::f16, Promote);
696 setOperationAction(Op, MVT::v4f16, Expand);
697 setOperationAction(Op, MVT::v8f16, Expand);
698 setOperationAction(Op, MVT::bf16, Promote);
699 setOperationAction(Op, MVT::v4bf16, Expand);
700 setOperationAction(Op, MVT::v8bf16, Expand);
701 }
702
703 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
704 for (auto Op : {
708 ISD::FADD,
709 ISD::FSUB,
710 ISD::FMUL,
711 ISD::FDIV,
712 ISD::FMA,
742 })
743 setOperationAction(Op, ScalarVT, Promote);
744
745 for (auto Op : {ISD::FNEG, ISD::FABS})
746 setOperationAction(Op, ScalarVT, Legal);
747
748 // Round-to-integer ops need custom lowering for fp16, as Promote doesn't work
749 // because the result type is integer.
753 setOperationAction(Op, ScalarVT, Custom);
754
755 // promote v4f16 to v4f32 when that is known to be safe.
756 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
757 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
758 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
759 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
760 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
761 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
762 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
763 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
764 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
765 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
766 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
767 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
768
778
779 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
801 };
802
803 if (!Subtarget->hasFullFP16()) {
804 LegalizeNarrowFP(MVT::f16);
805 }
806 LegalizeNarrowFP(MVT::bf16);
809
810 // AArch64 has implementations of a lot of rounding-like FP operations.
811 for (auto Op :
822 for (MVT Ty : {MVT::f32, MVT::f64})
824 if (Subtarget->hasFullFP16())
825 setOperationAction(Op, MVT::f16, Legal);
826 }
827
828 // Basic strict FP operations are legal
831 for (MVT Ty : {MVT::f32, MVT::f64})
833 if (Subtarget->hasFullFP16())
834 setOperationAction(Op, MVT::f16, Legal);
835 }
836
837 // Strict conversion to a larger type is legal
838 for (auto VT : {MVT::f32, MVT::f64})
840
842
845
847 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
850 } else {
853 }
856
857 // Generate outline atomics library calls only if LSE was not specified for
858 // subtarget
859 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
885#define LCALLNAMES(A, B, N) \
886 setLibcallName(A##N##_RELAX, #B #N "_relax"); \
887 setLibcallName(A##N##_ACQ, #B #N "_acq"); \
888 setLibcallName(A##N##_REL, #B #N "_rel"); \
889 setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
890#define LCALLNAME4(A, B) \
891 LCALLNAMES(A, B, 1) \
892 LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
893#define LCALLNAME5(A, B) \
894 LCALLNAMES(A, B, 1) \
895 LCALLNAMES(A, B, 2) \
896 LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
897 LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
898 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
899 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
900 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
901 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
902 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
903#undef LCALLNAMES
904#undef LCALLNAME4
905#undef LCALLNAME5
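    // For illustration, the first LCALLNAMES expansion of the CAS line above is
    // equivalent to:
    //   setLibcallName(RTLIB::OUTLINE_ATOMIC_CAS1_RELAX, "__aarch64_cas1_relax");
    //   setLibcallName(RTLIB::OUTLINE_ATOMIC_CAS1_ACQ, "__aarch64_cas1_acq");
    //   setLibcallName(RTLIB::OUTLINE_ATOMIC_CAS1_REL, "__aarch64_cas1_rel");
    //   setLibcallName(RTLIB::OUTLINE_ATOMIC_CAS1_ACQ_REL, "__aarch64_cas1_acq_rel");
    // with the 2-, 4-, 8- and (for CAS) 16-byte variants following the same
    // pattern.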
906 }
907
908 if (Subtarget->hasLSE128()) {
909 // Custom lowering because i128 is not legal. Must be replaced by 2x64
910 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
914 }
915
916 // 128-bit loads and stores can be done without expanding
919
920 // Aligned 128-bit loads and stores are single-copy atomic according to the
921 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
922 if (Subtarget->hasLSE2()) {
925 }
926
927 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
928 // custom lowering, as there are no un-paired non-temporal stores and
929 // legalization will break up 256 bit inputs.
931 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
932 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
933 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
938
939 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
940 // custom lowering, as there are no un-paired non-temporal loads and
941 // legalization will break up 256 bit inputs.
942 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
943 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
944 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
945 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
946 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
947 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
948 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
949 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
950
951 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
953
954 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
955 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
956 // Issue __sincos_stret if available.
959 } else {
962 }
963
964 if (Subtarget->getTargetTriple().isOSMSVCRT()) {
965 // MSVCRT doesn't have powi; fall back to pow
966 setLibcallName(RTLIB::POWI_F32, nullptr);
967 setLibcallName(RTLIB::POWI_F64, nullptr);
968 }
969
970 // Make floating-point constants legal for the large code model, so they don't
971 // become loads from the constant pool.
972 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
975 }
976
977 // AArch64 does not have floating-point extending loads, i1 sign-extending
978 // loads, floating-point truncating stores, or v2i32->v2i16 truncating stores.
979 for (MVT VT : MVT::fp_valuetypes()) {
980 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
981 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
982 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
983 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
984 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
985 }
986 for (MVT VT : MVT::integer_valuetypes())
988
989 for (MVT WideVT : MVT::fp_valuetypes()) {
990 for (MVT NarrowVT : MVT::fp_valuetypes()) {
991 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
992 setTruncStoreAction(WideVT, NarrowVT, Expand);
993 }
994 }
995 }
996
997 if (Subtarget->hasFPARMv8()) {
1001 }
1002
1003 // Indexed loads and stores are supported.
1004 for (unsigned im = (unsigned)ISD::PRE_INC;
1006 setIndexedLoadAction(im, MVT::i8, Legal);
1007 setIndexedLoadAction(im, MVT::i16, Legal);
1008 setIndexedLoadAction(im, MVT::i32, Legal);
1009 setIndexedLoadAction(im, MVT::i64, Legal);
1010 setIndexedLoadAction(im, MVT::f64, Legal);
1011 setIndexedLoadAction(im, MVT::f32, Legal);
1012 setIndexedLoadAction(im, MVT::f16, Legal);
1013 setIndexedLoadAction(im, MVT::bf16, Legal);
1014 setIndexedStoreAction(im, MVT::i8, Legal);
1015 setIndexedStoreAction(im, MVT::i16, Legal);
1016 setIndexedStoreAction(im, MVT::i32, Legal);
1017 setIndexedStoreAction(im, MVT::i64, Legal);
1018 setIndexedStoreAction(im, MVT::f64, Legal);
1019 setIndexedStoreAction(im, MVT::f32, Legal);
1020 setIndexedStoreAction(im, MVT::f16, Legal);
1021 setIndexedStoreAction(im, MVT::bf16, Legal);
1022 }
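  // With these marked Legal, pre/post-indexed forms such as `ldr x0, [x1, #8]!`
  // can be selected for the scalar integer and FP types listed above.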
1023
1024 // Trap.
1025 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1028
1029 // We combine OR nodes for bitfield operations.
1031 // Try to create BICs for vector ANDs.
1033
1034 // Vector add and sub nodes may conceal a high-half opportunity.
1035 // Also, try to fold ADD into CSINC/CSINV..
1038
1041
1042 // Try and combine setcc with csel
1044
1046
1053
1055
1057
1059
1063
1065
1067
1069
1071
1075
1077
1078 // In case of strict alignment, avoid an excessive number of byte wide stores.
1081 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1082
1086 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1087
1090
1093 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1094
1096
1098
1099 EnableExtLdPromotion = true;
1100
1101 // Set required alignment.
1103 // Set preferred alignments.
1104
1105 // Don't align loops on Windows. The SEH unwind info generation needs to
1106 // know the exact length of functions before the alignments have been
1107 // expanded.
1108 if (!Subtarget->isTargetWindows())
1112
1113 // Only change the limit for entries in a jump table if specified by
1114 // the subtarget, but not at the command line.
1115 unsigned MaxJT = STI.getMaximumJumpTableSize();
1116 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1118
1120
1122
1124
1125 if (Subtarget->hasNEON()) {
1126 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1127 // silliness like this:
1128 for (auto Op :
1146 setOperationAction(Op, MVT::v1f64, Expand);
1147
1148 for (auto Op :
1153 setOperationAction(Op, MVT::v1i64, Expand);
1154
1155 // AArch64 doesn't have direct vector->f32 conversion instructions for
1156 // elements smaller than i32, so promote the input to i32 first.
1157 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1158 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1159
1160 // Similarly, there is no direct i32 -> f64 vector conversion instruction.
1161 // Nor a direct i32 -> f16 vector conversion; set it to Custom, so the
1162 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1165 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1167
1168 if (Subtarget->hasFullFP16()) {
1171
1180 } else {
1181 // when AArch64 doesn't have fullfp16 support, promote the input
1182 // to i32 first.
1183 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1184 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1185 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1186 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1187 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1188 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1189 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1190 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1191 }
1192
1193 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1194 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1201 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1206 }
1207
1208 // Custom handling for some quad-vector types to detect MULL.
1209 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1210 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1211 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1212 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1213 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1214 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1215
1216 // Saturates
1217 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1218 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1223 }
1224
1225 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1226 MVT::v4i32}) {
1233 }
1234
1235 // Vector reductions
1236 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1237 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1238 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1243
1245 }
1246 }
1247 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1248 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1257 }
1262
1264 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1265 // Likewise, narrowing and extending vector loads/stores aren't handled
1266 // directly.
1269
1270 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1273 } else {
1276 }
1279
1282
1283 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1284 setTruncStoreAction(VT, InnerVT, Expand);
1285 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1286 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1287 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1288 }
1289 }
1290
1291 // AArch64 has implementations of a lot of rounding-like FP operations.
1292 for (auto Op :
1297 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1299 if (Subtarget->hasFullFP16())
1300 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1302 }
1303
1304 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1305
1310
1314
1315 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1316 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1317 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1318 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1319 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1320 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1321
1322 // ADDP custom lowering
1323 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1325 // FADDP custom lowering
1326 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1328 }
1329
1330 if (Subtarget->hasSME()) {
1332 }
1333
1334 // FIXME: Move lowering for more nodes here if those are common between
1335 // SVE and SME.
1336 if (Subtarget->hasSVEorSME()) {
1337 for (auto VT :
1338 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1343 }
1344 }
1345
1346 if (Subtarget->hasSVEorSME()) {
1347 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1390
1396
1405
1410
1411 if (!Subtarget->isLittleEndian())
1413
1414 if (Subtarget->hasSVE2orSME())
1415 // For SLI/SRI.
1417 }
1418
1419 // Illegal unpacked integer vector types.
1420 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1423 }
1424
1425 // Legalize unpacked bitcasts to REINTERPRET_CAST.
1426 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
1427 MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
1429
1430 for (auto VT :
1431 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1432 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1434
1435 for (auto VT :
1436 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1444
1448
1449 // There are no legal MVT::nxv16f## based types.
1450 if (VT != MVT::nxv16i1) {
1453 }
1454 }
1455
1456 // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
1457 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1458 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1459 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1464 }
1465
1466 // Firstly, exclude all scalable vector extending loads/truncating stores,
1467 // including both integer and floating-point scalable vectors.
1469 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1470 setTruncStoreAction(VT, InnerVT, Expand);
1471 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1472 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1473 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1474 }
1475 }
1476
1477 // Then, selectively enable those which we directly support.
1478 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1479 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1480 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1481 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1482 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1483 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1484 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1485 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1486 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1487 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1488 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1489 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1490 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1491 }
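  // For example, an extending load from nxv2i16 to nxv2i64 can now be selected
  // directly as a zero- or sign-extending LD1H/LD1SH with a 64-bit-element
  // predicate instead of being expanded.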
1492
1493 // SVE supports truncating stores of 64 and 128-bit vectors
1494 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1495 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1496 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1497 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1498 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1499
1500 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1501 MVT::nxv4f32, MVT::nxv2f64}) {
1537 if (Subtarget->isSVEAvailable())
1542
1556
1568
1569 if (!Subtarget->isLittleEndian())
1571 }
1572
1573 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1580
1581 if (!Subtarget->isLittleEndian())
1583 }
1584
1587
1588 // NEON doesn't support integer divides, but SVE does
1589 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1590 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1593 }
1594
1595 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1596 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1597 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1598
1599 if (Subtarget->isSVEAvailable()) {
1600 // NEON doesn't support across-vector reductions, but SVE does.
1601 for (auto VT :
1602 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
1604 }
1605
1606 if (!Subtarget->isNeonAvailable()) {
1607 setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Custom);
1608 setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Custom);
1609 setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Custom);
1610 setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Custom);
1611 setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Custom);
1612 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Custom);
1613 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Custom);
1614 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Custom);
1615 setTruncStoreAction(MVT::v1f64, MVT::v1f16, Custom);
1616 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Custom);
1617 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Custom);
1618 setTruncStoreAction(MVT::v1f64, MVT::v1f32, Custom);
1619 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Custom);
1620 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom);
1621 for (MVT VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1622 MVT::v4i32, MVT::v1i64, MVT::v2i64})
1623 addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ true);
1624
1625 for (MVT VT :
1626 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
1627 addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ true);
1628 }
1629
1630 // NOTE: Currently this has to happen after computeRegisterProperties rather
1631 // than the preferred option of combining it with the addRegisterClass call.
1632 if (Subtarget->useSVEForFixedLengthVectors()) {
1635 addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ false);
1638 addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ false);
1639
1640 // 64-bit results can mean a bigger-than-NEON input.
1641 for (auto VT : {MVT::v8i8, MVT::v4i16})
1644
1645 // 128-bit results imply a bigger-than-NEON input.
1646 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1648 for (auto VT : {MVT::v8f16, MVT::v4f32})
1650
1651 // These operations are not supported on NEON but SVE can do them.
1653 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1654 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1655 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1656 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1657 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1658 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1659 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1660 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1661 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1662 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1663 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1664 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1665 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1666 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1667 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1672
1673 // Int operations with no NEON support.
1674 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1675 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1683 }
1684
1685
1686 // Use SVE for vectors with more than 2 elements.
1687 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1689 }
1690
1691 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1692 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1693 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1694 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1695
1697 }
1698
1699 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1700 // Only required for llvm.aarch64.mops.memset.tag
1702 }
1703
1705
1706 if (Subtarget->hasSVE()) {
1711 }
1712
1713 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1714
1715 IsStrictFPEnabled = true;
1717
1718 if (Subtarget->isWindowsArm64EC()) {
1719 // FIXME: are there other intrinsics we need to add here?
1720 setLibcallName(RTLIB::MEMCPY, "#memcpy");
1721 setLibcallName(RTLIB::MEMSET, "#memset");
1722 setLibcallName(RTLIB::MEMMOVE, "#memmove");
1723 setLibcallName(RTLIB::REM_F32, "#fmodf");
1724 setLibcallName(RTLIB::REM_F64, "#fmod");
1725 setLibcallName(RTLIB::FMA_F32, "#fmaf");
1726 setLibcallName(RTLIB::FMA_F64, "#fma");
1727 setLibcallName(RTLIB::SQRT_F32, "#sqrtf");
1728 setLibcallName(RTLIB::SQRT_F64, "#sqrt");
1729 setLibcallName(RTLIB::CBRT_F32, "#cbrtf");
1730 setLibcallName(RTLIB::CBRT_F64, "#cbrt");
1731 setLibcallName(RTLIB::LOG_F32, "#logf");
1732 setLibcallName(RTLIB::LOG_F64, "#log");
1733 setLibcallName(RTLIB::LOG2_F32, "#log2f");
1734 setLibcallName(RTLIB::LOG2_F64, "#log2");
1735 setLibcallName(RTLIB::LOG10_F32, "#log10f");
1736 setLibcallName(RTLIB::LOG10_F64, "#log10");
1737 setLibcallName(RTLIB::EXP_F32, "#expf");
1738 setLibcallName(RTLIB::EXP_F64, "#exp");
1739 setLibcallName(RTLIB::EXP2_F32, "#exp2f");
1740 setLibcallName(RTLIB::EXP2_F64, "#exp2");
1741 setLibcallName(RTLIB::EXP10_F32, "#exp10f");
1742 setLibcallName(RTLIB::EXP10_F64, "#exp10");
1743 setLibcallName(RTLIB::SIN_F32, "#sinf");
1744 setLibcallName(RTLIB::SIN_F64, "#sin");
1745 setLibcallName(RTLIB::COS_F32, "#cosf");
1746 setLibcallName(RTLIB::COS_F64, "#cos");
1747 setLibcallName(RTLIB::POW_F32, "#powf");
1748 setLibcallName(RTLIB::POW_F64, "#pow");
1749 setLibcallName(RTLIB::LDEXP_F32, "#ldexpf");
1750 setLibcallName(RTLIB::LDEXP_F64, "#ldexp");
1751 setLibcallName(RTLIB::FREXP_F32, "#frexpf");
1752 setLibcallName(RTLIB::FREXP_F64, "#frexp");
1753 }
1754}
1755
1756void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1757 assert(VT.isVector() && "VT should be a vector type");
1758
1759 if (VT.isFloatingPoint()) {
1761 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1762 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1763 }
1764
1765 // Mark vector float intrinsics as expand.
1766 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1776 }
1777
1778 // But we do support custom-lowering for FCOPYSIGN.
1779 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1780 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
1781 VT == MVT::v8f16) &&
1782 Subtarget->hasFullFP16()))
1784
1797
1801 for (MVT InnerVT : MVT::all_valuetypes())
1802 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1803
1804 // CNT supports only B element sizes, then use UADDLP to widen.
1805 if (VT != MVT::v8i8 && VT != MVT::v16i8)
1807
1813
1814 for (unsigned Opcode :
1817 setOperationAction(Opcode, VT, Custom);
1818
1819 if (!VT.isFloatingPoint())
1821
1822 // [SU][MIN|MAX] are available for all NEON types apart from i64.
1823 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1824 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1825 setOperationAction(Opcode, VT, Legal);
1826
1827 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1828 // NEON types.
1829 if (VT.isFloatingPoint() &&
1830 VT.getVectorElementType() != MVT::bf16 &&
1831 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1832 for (unsigned Opcode :
1838 setOperationAction(Opcode, VT, Legal);
1839
1840 // Strict fp extend and trunc are legal
1841 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
1843 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
1845
1846 // FIXME: We could potentially make use of the vector comparison instructions
1847 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
1848 // complications:
1849 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1850 // so we would need to expand when the condition code doesn't match the
1851 // kind of comparison.
1852 // * Some kinds of comparison require more than one FCMXY instruction so
1853 // would need to be expanded instead.
1854 // * The lowering of the non-strict versions involves target-specific ISD
1855 // nodes so we would likely need to add strict versions of all of them and
1856 // handle them appropriately.
1859
1860 if (Subtarget->isLittleEndian()) {
1861 for (unsigned im = (unsigned)ISD::PRE_INC;
1865 }
1866 }
1867
1868 if (Subtarget->hasD128()) {
1871 }
1872}
1873
1875 EVT OpVT) const {
1876 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
1877 if (!Subtarget->hasSVE())
1878 return true;
1879
1880 // We can only support legal predicate result types. We can use the SVE
1881 // whilelo instruction for generating fixed-width predicates too.
1882 if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
1883 ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
1884 ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
1885 return true;
1886
1887 // The whilelo instruction only works with i32 or i64 scalar inputs.
1888 if (OpVT != MVT::i32 && OpVT != MVT::i64)
1889 return true;
1890
1891 return false;
1892}
1893
1895 return !Subtarget->hasSVEorSME() || VT != MVT::nxv16i1;
1896}
1897
1898void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT,
1899 bool StreamingSVE) {
1900 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1901
1902 // By default everything must be expanded.
1903 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1905
1906 if (VT.isFloatingPoint()) {
1916 }
1917
1918 // Mark integer truncating stores/extending loads as having custom lowering
1919 if (VT.isInteger()) {
1920 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
1921 while (InnerVT != VT) {
1922 setTruncStoreAction(VT, InnerVT, Custom);
1923 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom);
1924 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom);
1925 InnerVT = InnerVT.changeVectorElementType(
1926 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
1927 }
1928 }
1929
1930 // Mark floating-point truncating stores/extending loads as having custom
1931 // lowering
1932 if (VT.isFloatingPoint()) {
1933 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
1934 while (InnerVT != VT) {
1935 setTruncStoreAction(VT, InnerVT, Custom);
1936 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom);
1937 InnerVT = InnerVT.changeVectorElementType(
1939 }
1940 }
1941
1942 // Lower fixed length vector operations to scalable equivalents.
1947 setOperationAction(ISD::BITCAST, VT, StreamingSVE ? Legal : Custom);
1982 setOperationAction(ISD::LOAD, VT, StreamingSVE ? Legal : Custom);
1983 setOperationAction(ISD::MGATHER, VT, StreamingSVE ? Expand : Custom);
1985 setOperationAction(ISD::MSCATTER, VT, StreamingSVE ? Expand : Custom);
2004 setOperationAction(ISD::STORE, VT, StreamingSVE ? Legal : Custom);
2020 StreamingSVE ? Expand : Custom);
2031}
2032
2033void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
2034 addRegisterClass(VT, &AArch64::FPR64RegClass);
2035 addTypeForNEON(VT);
2036}
2037
2038void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
2039 addRegisterClass(VT, &AArch64::FPR128RegClass);
2040 addTypeForNEON(VT);
2041}
2042
2044 LLVMContext &C, EVT VT) const {
2045 if (!VT.isVector())
2046 return MVT::i32;
2047 if (VT.isScalableVector())
2048 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2050}
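// For example, a scalar operand type yields an i32 result here, while a
// scalable vector such as nxv4i32 yields an nxv4i1 predicate (one i1 per lane).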
2051
2052// isIntImmediate - This method tests to see if the node is a constant
2053// operand. If so Imm will receive the value.
2054static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2055 if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
2056 Imm = C->getZExtValue();
2057 return true;
2058 }
2059 return false;
2060}
2061
2062// isOpcWithIntImmediate - This method tests to see if the node is a specific
2063// opcode and that it has a immediate integer right operand.
2064// If so Imm will receive the value.
2065static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2066 uint64_t &Imm) {
2067 return N->getOpcode() == Opc &&
2068 isIntImmediate(N->getOperand(1).getNode(), Imm);
2069}
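// For example, isOpcWithIntImmediate(N, ISD::AND, Imm) matches (and x, C) for a
// constant C and stores C's zero-extended value in Imm.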
2070
2071static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2072 const APInt &Demanded,
2074 unsigned NewOpc) {
2075 uint64_t OldImm = Imm, NewImm, Enc;
2076 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2077
2078 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2079 // bimm64.
2080 if (Imm == 0 || Imm == Mask ||
2082 return false;
2083
2084 unsigned EltSize = Size;
2085 uint64_t DemandedBits = Demanded.getZExtValue();
2086
2087 // Clear bits that are not demanded.
2088 Imm &= DemandedBits;
2089
2090 while (true) {
2091 // The goal here is to set the non-demanded bits in a way that minimizes
2092 // the number of transitions between 0 and 1. In order to achieve this goal,
2093 // we set the non-demanded bits to the value of the preceding demanded bits.
2094 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2095 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2096 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2097 // The final result is 0b11000011.
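  // (0b11000011 is not a contiguous run of ones itself, but its complement
  // within the 8-bit element, 0b00111100, is, so the loop below can stop:
  // such values are encodable as logical immediates.)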
2098 uint64_t NonDemandedBits = ~DemandedBits;
2099 uint64_t InvertedImm = ~Imm & DemandedBits;
2100 uint64_t RotatedImm =
2101 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2102 NonDemandedBits;
2103 uint64_t Sum = RotatedImm + NonDemandedBits;
2104 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2105 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2106 NewImm = (Imm | Ones) & Mask;
2107
2108 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2109 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2110 // we halve the element size and continue the search.
2111 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2112 break;
2113
2114 // We cannot shrink the element size any further if it is 2-bits.
2115 if (EltSize == 2)
2116 return false;
2117
2118 EltSize /= 2;
2119 Mask >>= EltSize;
2120 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2121
2122 // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
2123 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2124 return false;
2125
2126 // Merge the upper and lower halves of Imm and DemandedBits.
2127 Imm |= Hi;
2128 DemandedBits |= DemandedBitsHi;
2129 }
2130
2131 ++NumOptimizedImms;
2132
2133 // Replicate the element across the register width.
2134 while (EltSize < Size) {
2135 NewImm |= NewImm << EltSize;
2136 EltSize *= 2;
2137 }
2138
2139 (void)OldImm;
2140 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2141 "demanded bits should never be altered");
2142 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2143
2144 // Create the new constant immediate node.
2145 EVT VT = Op.getValueType();
2146 SDLoc DL(Op);
2147 SDValue New;
2148
2149 // If the new constant immediate is all-zeros or all-ones, let the target
2150 // independent DAG combine optimize this node.
2151 if (NewImm == 0 || NewImm == OrigMask) {
2152 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2153 TLO.DAG.getConstant(NewImm, DL, VT));
2154 // Otherwise, create a machine node so that target independent DAG combine
2155 // doesn't undo this optimization.
2156 } else {
2158 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2159 New = SDValue(
2160 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2161 }
2162
2163 return TLO.CombineTo(Op, New);
2164}
2165
2167 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2168 TargetLoweringOpt &TLO) const {
2169 // Delay this optimization to as late as possible.
2170 if (!TLO.LegalOps)
2171 return false;
2172
2174 return false;
2175
2176 EVT VT = Op.getValueType();
2177 if (VT.isVector())
2178 return false;
2179
2180 unsigned Size = VT.getSizeInBits();
2181 assert((Size == 32 || Size == 64) &&
2182 "i32 or i64 is expected after legalization.");
2183
2184 // Exit early if we demand all bits.
2185 if (DemandedBits.popcount() == Size)
2186 return false;
2187
2188 unsigned NewOpc;
2189 switch (Op.getOpcode()) {
2190 default:
2191 return false;
2192 case ISD::AND:
2193 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2194 break;
2195 case ISD::OR:
2196 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2197 break;
2198 case ISD::XOR:
2199 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2200 break;
2201 }
2202 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2203 if (!C)
2204 return false;
2205 uint64_t Imm = C->getZExtValue();
2206 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2207}
2208
2209/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2210 /// Mask are known to be either zero or one and return them in Known.
2212 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2213 const SelectionDAG &DAG, unsigned Depth) const {
2214 switch (Op.getOpcode()) {
2215 default:
2216 break;
2217 case AArch64ISD::DUP: {
2218 SDValue SrcOp = Op.getOperand(0);
2219 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2220 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2221 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2222 "Expected DUP implicit truncation");
2223 Known = Known.trunc(Op.getScalarValueSizeInBits());
2224 }
2225 break;
2226 }
2227 case AArch64ISD::CSEL: {
2228 KnownBits Known2;
2229 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2230 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2231 Known = Known.intersectWith(Known2);
2232 break;
2233 }
2234 case AArch64ISD::BICi: {
2235 // Compute the bit cleared value.
2236 uint64_t Mask =
2237 ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
2238 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2239 Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
2240 break;
2241 }
2242 case AArch64ISD::VLSHR: {
2243 KnownBits Known2;
2244 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2245 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2246 Known = KnownBits::lshr(Known, Known2);
2247 break;
2248 }
2249 case AArch64ISD::VASHR: {
2250 KnownBits Known2;
2251 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2252 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2253 Known = KnownBits::ashr(Known, Known2);
2254 break;
2255 }
2256 case AArch64ISD::VSHL: {
2257 KnownBits Known2;
2258 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2259 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2260 Known = KnownBits::shl(Known, Known2);
2261 break;
2262 }
2263 case AArch64ISD::MOVI: {
2265 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2266 break;
2267 }
2269 case AArch64ISD::ADDlow: {
2270 if (!Subtarget->isTargetILP32())
2271 break;
2272 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2273 Known.Zero = APInt::getHighBitsSet(64, 32);
2274 break;
2275 }
2277 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2278 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2279 break;
2280 }
2282 Intrinsic::ID IntID =
2283 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2284 switch (IntID) {
2285 default: return;
2286 case Intrinsic::aarch64_ldaxr:
2287 case Intrinsic::aarch64_ldxr: {
2288 unsigned BitWidth = Known.getBitWidth();
2289 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2290 unsigned MemBits = VT.getScalarSizeInBits();
2291 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2292 return;
2293 }
2294 }
2295 break;
2296 }
2298 case ISD::INTRINSIC_VOID: {
2299 unsigned IntNo = Op.getConstantOperandVal(0);
2300 switch (IntNo) {
2301 default:
2302 break;
2303 case Intrinsic::aarch64_neon_uaddlv: {
2304 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2305 unsigned BitWidth = Known.getBitWidth();
2306 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2307 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2308 assert(BitWidth >= Bound && "Unexpected width!");
2310 Known.Zero |= Mask;
2311 }
2312 break;
2313 }
2314 case Intrinsic::aarch64_neon_umaxv:
2315 case Intrinsic::aarch64_neon_uminv: {
2316 // Figure out the datatype of the vector operand. The UMINV instruction
2317 // will zero extend the result, so we can mark as known zero all the
2318 // bits larger than the element datatype. 32-bit or larger doesn't need
2319 // this as those are legal types and will be handled by isel directly.
2320 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2321 unsigned BitWidth = Known.getBitWidth();
2322 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2323 assert(BitWidth >= 8 && "Unexpected width!");
2325 Known.Zero |= Mask;
2326 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2327 assert(BitWidth >= 16 && "Unexpected width!");
2329 Known.Zero |= Mask;
2330 }
2331 break;
2332 } break;
2333 }
2334 }
2335 }
2336}
2337
2339 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2340 unsigned Depth) const {
2341 EVT VT = Op.getValueType();
2342 unsigned VTBits = VT.getScalarSizeInBits();
2343 unsigned Opcode = Op.getOpcode();
2344 switch (Opcode) {
2345 case AArch64ISD::CMEQ:
2346 case AArch64ISD::CMGE:
2347 case AArch64ISD::CMGT:
2348 case AArch64ISD::CMHI:
2349 case AArch64ISD::CMHS:
2350 case AArch64ISD::FCMEQ:
2351 case AArch64ISD::FCMGE:
2352 case AArch64ISD::FCMGT:
2353 case AArch64ISD::CMEQz:
2354 case AArch64ISD::CMGEz:
2355 case AArch64ISD::CMGTz:
2356 case AArch64ISD::CMLEz:
2357 case AArch64ISD::CMLTz:
2358 case AArch64ISD::FCMEQz:
2359 case AArch64ISD::FCMGEz:
2360 case AArch64ISD::FCMGTz:
2361 case AArch64ISD::FCMLEz:
2362 case AArch64ISD::FCMLTz:
2363 // Compares return either 0 or all-ones
2364 return VTBits;
2365 }
2366
2367 return 1;
2368}
2369
2371 EVT) const {
2372 return MVT::i64;
2373}
2374
2376 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2377 unsigned *Fast) const {
2378 if (Subtarget->requiresStrictAlign())
2379 return false;
2380
2381 if (Fast) {
2382 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2383 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2384 // See comments in performSTORECombine() for more details about
2385 // these conditions.
2386
2387 // Code that uses clang vector extensions can mark that it
2388 // wants unaligned accesses to be treated as fast by
2389 // underspecifying alignment to be 1 or 2.
2390 Alignment <= 2 ||
2391
2392 // Disregard v2i64. Memcpy lowering produces those and splitting
2393 // them regresses performance on micro-benchmarks and olden/bh.
2394 VT == MVT::v2i64;
2395 }
2396 return true;
2397}
2398
2399// Same as above but handling LLTs instead.
2401 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2402 unsigned *Fast) const {
2403 if (Subtarget->requiresStrictAlign())
2404 return false;
2405
2406 if (Fast) {
2407 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2408 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2409 Ty.getSizeInBytes() != 16 ||
2410 // See comments in performSTORECombine() for more details about
2411 // these conditions.
2412
2413 // Code that uses clang vector extensions can mark that it
2414 // wants unaligned accesses to be treated as fast by
2415 // underspecifying alignment to be 1 or 2.
2416 Alignment <= 2 ||
2417
2418 // Disregard v2i64. Memcpy lowering produces those and splitting
2419 // them regresses performance on micro-benchmarks and olden/bh.
2420 Ty == LLT::fixed_vector(2, 64);
2421 }
2422 return true;
2423}
2424
2425FastISel *
2427 const TargetLibraryInfo *libInfo) const {
2428 return AArch64::createFastISel(funcInfo, libInfo);
2429}
2430
2431const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2432#define MAKE_CASE(V) \
2433 case V: \
2434 return #V;
2435 switch ((AArch64ISD::NodeType)Opcode) {
2437 break;
2754 }
2755#undef MAKE_CASE
2756 return nullptr;
2757}
2758
2761 MachineBasicBlock *MBB) const {
2762 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2763 // phi node:
2764
2765 // OrigBB:
2766 // [... previous instrs leading to comparison ...]
2767 // b.ne TrueBB
2768 // b EndBB
2769 // TrueBB:
2770 // ; Fallthrough
2771 // EndBB:
2772 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2773
2774 MachineFunction *MF = MBB->getParent();
2775 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2776 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2777 DebugLoc DL = MI.getDebugLoc();
2779
2780 Register DestReg = MI.getOperand(0).getReg();
2781 Register IfTrueReg = MI.getOperand(1).getReg();
2782 Register IfFalseReg = MI.getOperand(2).getReg();
2783 unsigned CondCode = MI.getOperand(3).getImm();
2784 bool NZCVKilled = MI.getOperand(4).isKill();
2785
2786 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2787 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2788 MF->insert(It, TrueBB);
2789 MF->insert(It, EndBB);
2790
2791 // Transfer rest of current basic-block to EndBB
2792 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2793 MBB->end());
2795
2796 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2797 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2798 MBB->addSuccessor(TrueBB);
2799 MBB->addSuccessor(EndBB);
2800
2801 // TrueBB falls through to the end.
2802 TrueBB->addSuccessor(EndBB);
2803
2804 if (!NZCVKilled) {
2805 TrueBB->addLiveIn(AArch64::NZCV);
2806 EndBB->addLiveIn(AArch64::NZCV);
2807 }
2808
2809 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2810 .addReg(IfTrueReg)
2811 .addMBB(TrueBB)
2812 .addReg(IfFalseReg)
2813 .addMBB(MBB);
2814
2815 MI.eraseFromParent();
2816 return EndBB;
2817}
2818
2820 MachineInstr &MI, MachineBasicBlock *BB) const {
2822 BB->getParent()->getFunction().getPersonalityFn())) &&
2823 "SEH does not use catchret!");
2824 return BB;
2825}
2826
2829 MachineBasicBlock *MBB) const {
2830 MachineFunction &MF = *MBB->getParent();
2831 MachineBasicBlock::iterator MBBI = MI.getIterator();
2833 const AArch64InstrInfo &TII =
2834 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
2835 Register TargetReg = MI.getOperand(0).getReg();
2837 TII.probedStackAlloc(MBBI, TargetReg, false);
2838
2839 MI.eraseFromParent();
2840 return NextInst->getParent();
2841}
2842
2844AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2846 MachineBasicBlock *BB) const {
2847 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2848 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2849
2850 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2851 MIB.add(MI.getOperand(1)); // slice index register
2852 MIB.add(MI.getOperand(2)); // slice index offset
2853 MIB.add(MI.getOperand(3)); // pg
2854 MIB.add(MI.getOperand(4)); // base
2855 MIB.add(MI.getOperand(5)); // offset
2856
2857 MI.eraseFromParent(); // The pseudo is gone now.
2858 return BB;
2859}
2860
2861MachineBasicBlock *AArch64TargetLowering::EmitFill(MachineInstr &MI,
2862 MachineBasicBlock *BB) const {
2863 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2864 MachineInstrBuilder MIB =
2865 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
2866
2867 MIB.addReg(AArch64::ZA, RegState::Define);
2868 MIB.add(MI.getOperand(0)); // Vector select register
2869 MIB.add(MI.getOperand(1)); // Vector select offset
2870 MIB.add(MI.getOperand(2)); // Base
2871 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
2872
2873 MI.eraseFromParent(); // The pseudo is gone now.
2874 return BB;
2875}
2876
2877MachineBasicBlock *AArch64TargetLowering::EmitZTInstr(MachineInstr &MI,
2878 MachineBasicBlock *BB,
2879 unsigned Opcode,
2880 bool Op0IsDef) const {
2881 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2882 MachineInstrBuilder MIB;
2883
2884 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
2885 .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
2886 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
2887 MIB.add(MI.getOperand(I));
2888
2889 MI.eraseFromParent(); // The pseudo is gone now.
2890 return BB;
2891}
2892
2893MachineBasicBlock *
2894AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
2895 MachineInstr &MI,
2896 MachineBasicBlock *BB, bool HasTile) const {
2897 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2898 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2899 unsigned StartIdx = 0;
2900
2901 if (HasTile) {
2902 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2903 MIB.addReg(BaseReg + MI.getOperand(0).getImm());
2904 StartIdx = 1;
2905 } else
2906 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
2907
2908 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
2909 MIB.add(MI.getOperand(I));
2910
2911 MI.eraseFromParent(); // The pseudo is gone now.
2912 return BB;
2913}
2914
2915MachineBasicBlock *AArch64TargetLowering::EmitZero(MachineInstr &MI,
2916 MachineBasicBlock *BB) const {
2917 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2918 MachineInstrBuilder MIB =
2919 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
2920 MIB.add(MI.getOperand(0)); // Mask
2921
2922 unsigned Mask = MI.getOperand(0).getImm();
2923 for (unsigned I = 0; I < 8; I++) {
2924 if (Mask & (1 << I))
2925 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
2926 }
2927
2928 MI.eraseFromParent(); // The pseudo is gone now.
2929 return BB;
2930}
2931
2932MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
2933 MachineInstr &MI, MachineBasicBlock *BB) const {
2934
2935 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
2936 if (SMEOrigInstr != -1) {
2937 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2938 uint64_t SMEMatrixType =
2939 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
2940 switch (SMEMatrixType) {
2941 case (AArch64::SMEMatrixArray):
2942 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB, /*HasTile*/ false);
2943 case (AArch64::SMEMatrixTileB):
2944 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB, /*HasTile*/ true);
2945 case (AArch64::SMEMatrixTileH):
2946 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB, /*HasTile*/ true);
2947 case (AArch64::SMEMatrixTileS):
2948 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB, /*HasTile*/ true);
2949 case (AArch64::SMEMatrixTileD):
2950 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB, /*HasTile*/ true);
2951 case (AArch64::SMEMatrixTileQ):
2952 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB, /*HasTile*/ true);
2953 }
2954 }
2955
2956 switch (MI.getOpcode()) {
2957 default:
2958#ifndef NDEBUG
2959 MI.dump();
2960#endif
2961 llvm_unreachable("Unexpected instruction for custom inserter!");
2962
2963 case AArch64::F128CSEL:
2964 return EmitF128CSEL(MI, BB);
2965 case TargetOpcode::STATEPOINT:
2966 // STATEPOINT is a pseudo instruction which has no implicit defs/uses
2967 // while bl call instruction (where statepoint will be lowered at the end)
2968 // has implicit def. This def is early-clobber as it will be set at
2969 // the moment of the call and earlier than any use is read.
2970 // Add this implicit dead def here as a workaround.
2971 MI.addOperand(*MI.getMF(),
2972 MachineOperand::CreateReg(
2973 AArch64::LR, /*isDef*/ true,
2974 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
2975 /*isUndef*/ false, /*isEarlyClobber*/ true));
2976 [[fallthrough]];
2977 case TargetOpcode::STACKMAP:
2978 case TargetOpcode::PATCHPOINT:
2979 return emitPatchPoint(MI, BB);
2980
2981 case TargetOpcode::PATCHABLE_EVENT_CALL:
2982 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
2983 return BB;
2984
2985 case AArch64::CATCHRET:
2986 return EmitLoweredCatchRet(MI, BB);
2987
2988 case AArch64::PROBED_STACKALLOC_DYN:
2989 return EmitDynamicProbedAlloc(MI, BB);
2990
2991 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
2992 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
2993 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
2994 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
2995 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
2996 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
2997 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
2998 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
2999 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
3000 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
3001 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
3002 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
3003 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
3004 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
3005 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
3006 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
3007 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
3008 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
3009 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
3010 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
3011 case AArch64::LDR_ZA_PSEUDO:
3012 return EmitFill(MI, BB);
3013 case AArch64::LDR_TX_PSEUDO:
3014 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
3015 case AArch64::STR_TX_PSEUDO:
3016 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
3017 case AArch64::ZERO_M_PSEUDO:
3018 return EmitZero(MI, BB);
3019 case AArch64::ZERO_T_PSEUDO:
3020 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
3021 }
3022}
3023
3024//===----------------------------------------------------------------------===//
3025// AArch64 Lowering private implementation.
3026//===----------------------------------------------------------------------===//
3027
3028//===----------------------------------------------------------------------===//
3029// Lowering Code
3030//===----------------------------------------------------------------------===//
3031
3032// Forward declarations of SVE fixed length lowering helpers
3037 SelectionDAG &DAG);
3040 EVT VT);
3041
3042/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3043static bool isZerosVector(const SDNode *N) {
3044 // Look through a bit convert.
3045 while (N->getOpcode() == ISD::BITCAST)
3046 N = N->getOperand(0).getNode();
3047
3048 if (ISD::isConstantSplatVectorAllZeros(N))
3049 return true;
3050
3051 if (N->getOpcode() != AArch64ISD::DUP)
3052 return false;
3053
3054 auto Opnd0 = N->getOperand(0);
3055 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3056}
3057
3058/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3059/// CC
3060static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
3061 switch (CC) {
3062 default:
3063 llvm_unreachable("Unknown condition code!");
3064 case ISD::SETNE:
3065 return AArch64CC::NE;
3066 case ISD::SETEQ:
3067 return AArch64CC::EQ;
3068 case ISD::SETGT:
3069 return AArch64CC::GT;
3070 case ISD::SETGE:
3071 return AArch64CC::GE;
3072 case ISD::SETLT:
3073 return AArch64CC::LT;
3074 case ISD::SETLE:
3075 return AArch64CC::LE;
3076 case ISD::SETUGT:
3077 return AArch64CC::HI;
3078 case ISD::SETUGE:
3079 return AArch64CC::HS;
3080 case ISD::SETULT:
3081 return AArch64CC::LO;
3082 case ISD::SETULE:
3083 return AArch64CC::LS;
3084 }
3085}
3086
3087/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3088static void changeFPCCToAArch64CC(ISD::CondCode CC,
3089 AArch64CC::CondCode &CondCode,
3090 AArch64CC::CondCode &CondCode2) {
3091 CondCode2 = AArch64CC::AL;
3092 switch (CC) {
3093 default:
3094 llvm_unreachable("Unknown FP condition!");
3095 case ISD::SETEQ:
3096 case ISD::SETOEQ:
3097 CondCode = AArch64CC::EQ;
3098 break;
3099 case ISD::SETGT:
3100 case ISD::SETOGT:
3101 CondCode = AArch64CC::GT;
3102 break;
3103 case ISD::SETGE:
3104 case ISD::SETOGE:
3105 CondCode = AArch64CC::GE;
3106 break;
3107 case ISD::SETOLT:
3108 CondCode = AArch64CC::MI;
3109 break;
3110 case ISD::SETOLE:
3111 CondCode = AArch64CC::LS;
3112 break;
3113 case ISD::SETONE:
3114 CondCode = AArch64CC::MI;
3115 CondCode2 = AArch64CC::GT;
3116 break;
3117 case ISD::SETO:
3118 CondCode = AArch64CC::VC;
3119 break;
3120 case ISD::SETUO:
3121 CondCode = AArch64CC::VS;
3122 break;
3123 case ISD::SETUEQ:
3124 CondCode = AArch64CC::EQ;
3125 CondCode2 = AArch64CC::VS;
3126 break;
3127 case ISD::SETUGT:
3128 CondCode = AArch64CC::HI;
3129 break;
3130 case ISD::SETUGE:
3131 CondCode = AArch64CC::PL;
3132 break;
3133 case ISD::SETLT:
3134 case ISD::SETULT:
3135 CondCode = AArch64CC::LT;
3136 break;
3137 case ISD::SETLE:
3138 case ISD::SETULE:
3139 CondCode = AArch64CC::LE;
3140 break;
3141 case ISD::SETNE:
3142 case ISD::SETUNE:
3143 CondCode = AArch64CC::NE;
3144 break;
3145 }
3146}
3147
3148/// Convert a DAG fp condition code to an AArch64 CC.
3149/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3150/// should be AND'ed instead of OR'ed.
3151static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
3152 AArch64CC::CondCode &CondCode,
3153 AArch64CC::CondCode &CondCode2) {
3154 CondCode2 = AArch64CC::AL;
3155 switch (CC) {
3156 default:
3157 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3158 assert(CondCode2 == AArch64CC::AL);
3159 break;
3160 case ISD::SETONE:
3161 // (a one b)
3162 // == ((a olt b) || (a ogt b))
3163 // == ((a ord b) && (a une b))
3164 CondCode = AArch64CC::VC;
3165 CondCode2 = AArch64CC::NE;
3166 break;
3167 case ISD::SETUEQ:
3168 // (a ueq b)
3169 // == ((a uno b) || (a oeq b))
3170 // == ((a ule b) && (a uge b))
3171 CondCode = AArch64CC::PL;
3172 CondCode2 = AArch64CC::LE;
3173 break;
3174 }
3175}
3176
3177/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3178/// CC usable with the vector instructions. Fewer operations are available
3179/// without a real NZCV register, so we have to use less efficient combinations
3180/// to get the same effect.
3181static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
3182 AArch64CC::CondCode &CondCode,
3183 AArch64CC::CondCode &CondCode2,
3184 bool &Invert) {
3185 Invert = false;
3186 switch (CC) {
3187 default:
3188 // Mostly the scalar mappings work fine.
3189 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3190 break;
3191 case ISD::SETUO:
3192 Invert = true;
3193 [[fallthrough]];
3194 case ISD::SETO:
3195 CondCode = AArch64CC::MI;
3196 CondCode2 = AArch64CC::GE;
3197 break;
3198 case ISD::SETUEQ:
3199 case ISD::SETULT:
3200 case ISD::SETULE:
3201 case ISD::SETUGT:
3202 case ISD::SETUGE:
3203 // All of the compare-mask comparisons are ordered, but we can switch
3204 // between the two by a double inversion. E.g. ULE == !OGT.
3205 Invert = true;
3206 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3207 CondCode, CondCode2);
3208 break;
3209 }
3210}
3211
3212static bool isLegalArithImmed(uint64_t C) {
3213 // Matches AArch64DAGToDAGISel::SelectArithImmed().
3214 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3215 LLVM_DEBUG(dbgs() << "Is imm " << C
3216 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3217 return IsLegal;
3218}
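// For illustration: C = 0xFFF (a plain 12-bit value) and C = 0x123000 (a
// 12-bit value shifted left by 12) are accepted above, while C = 0x1001 is
// rejected because its low 12 bits are non-zero and it does not fit in 12 bits.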
3219
3220// Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
3221// the grounds that "op1 - (-op2) == op1 + op2"? Not always: the C and V flags
3222// can be set differently by this operation. It comes down to whether
3223// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are equal,
3224// everything is fine; if not, the optimization is wrong. Thus general
3225// comparisons are only valid if op2 != 0.
3226//
3227// So, finally, the only LLVM-native comparisons that don't mention C and V
3228// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
3229// the absence of information about op2.
3230static bool isCMN(SDValue Op, ISD::CondCode CC) {
3231 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3232 (CC == ISD::SETEQ || CC == ISD::SETNE);
3233}
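// For example, if op2 were 0, "cmp op1, (0 - op2)" computes op1 - 0 and always
// sets the carry flag, whereas "cmn op1, 0" computes op1 + 0 and always clears
// it, so unsigned conditions such as HS/LO would evaluate differently. N and Z
// are unaffected, which is why SETEQ/SETNE remain safe.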
3234
3235static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
3236 SelectionDAG &DAG, SDValue Chain,
3237 bool IsSignaling) {
3238 EVT VT = LHS.getValueType();
3239 assert(VT != MVT::f128);
3240
3241 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3242
3243 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3244 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3245 {Chain, LHS});
3246 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3247 {LHS.getValue(1), RHS});
3248 Chain = RHS.getValue(1);
3249 VT = MVT::f32;
3250 }
3251 unsigned Opcode =
3252 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3253 return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
3254}
3255
3256static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3257 const SDLoc &dl, SelectionDAG &DAG) {
3258 EVT VT = LHS.getValueType();
3259 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3260
3261 if (VT.isFloatingPoint()) {
3262 assert(VT != MVT::f128);
3263 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3264 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
3265 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
3266 VT = MVT::f32;
3267 }
3268 return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
3269 }
3270
3271 // The CMP instruction is just an alias for SUBS, and representing it as
3272 // SUBS means that it's possible to get CSE with subtract operations.
3273 // A later phase can perform the optimization of setting the destination
3274 // register to WZR/XZR if it ends up being unused.
3275 unsigned Opcode = AArch64ISD::SUBS;
3276
3277 if (isCMN(RHS, CC)) {
3278 // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
3279 Opcode = AArch64ISD::ADDS;
3280 RHS = RHS.getOperand(1);
3281 } else if (isCMN(LHS, CC)) {
3282 // As we are looking for EQ/NE compares, the operands can be commuted; can
3283 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
3284 Opcode = AArch64ISD::ADDS;
3285 LHS = LHS.getOperand(1);
3286 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3287 if (LHS.getOpcode() == ISD::AND) {
3288 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3289 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3290 // of the signed comparisons.
3291 const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
3292 DAG.getVTList(VT, MVT_CC),
3293 LHS.getOperand(0),
3294 LHS.getOperand(1));
3295 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3296 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3297 return ANDSNode.getValue(1);
3298 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3299 // Use result of ANDS
3300 return LHS.getValue(1);
3301 }
3302 }
3303
3304 return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
3305 .getValue(1);
3306}
3307
3308/// \defgroup AArch64CCMP CMP;CCMP matching
3309///
3310/// These functions deal with the formation of CMP;CCMP;... sequences.
3311/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3312/// a comparison. They set the NZCV flags to a predefined value if their
3313/// predicate is false. This allows us to express arbitrary conjunctions, for
3314/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3315/// expressed as:
3316/// cmp A
3317/// ccmp B, inv(CB), CA
3318/// check for CB flags
3319///
3320/// This naturally lets us implement chains of AND operations with SETCC
3321/// operands. And we can even implement some other situations by transforming
3322/// them:
3323/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3324/// negating the flags used in a CCMP/FCCMP operations.
3325/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3326/// by negating the flags we test for afterwards. i.e.
3327/// NEG (CMP CCMP CCMP ...) can be implemented.
3328/// - Note that we can only ever negate all previously processed results.
3329/// What we can not implement by flipping the flags to test is a negation
3330/// of two sub-trees (because the negation affects all sub-trees emitted so
3331/// far, so the 2nd sub-tree we emit would also affect the first).
3332/// With those tools we can implement some OR operations:
3333/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3334/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3335/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3336/// elimination rules from earlier to implement the whole thing as a
3337/// CCMP/FCCMP chain.
3338///
3339/// As a complete example:
3340/// or (or (setCA (cmp A)) (setCB (cmp B)))
3341/// (and (setCC (cmp C)) (setCD (cmp D)))"
3342/// can be reassociated to:
3343/// or (and (setCC (cmp C)) (setCD (cmp D)))
3344/// (or (setCA (cmp A)) (setCB (cmp B)))
3345/// can be transformed to:
3346/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3347/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
3348/// which can be implemented as:
3349/// cmp C
3350/// ccmp D, inv(CD), CC
3351/// ccmp A, CA, inv(CD)
3352/// ccmp B, CB, inv(CA)
3353/// check for CB flags
3354///
3355/// A counterexample is "or (and A B) (and C D)" which translates to
3356/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
3357/// can only implement 1 of the inner (not) operations, but not both!
3358/// @{
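// As a concrete illustration of the scheme above, a conjunction such as
// "(a == 0) && (b == 5)" can be emitted roughly as (registers and the NZCV
// immediate chosen for illustration only):
//   cmp  w0, #0           // flags for the first compare
//   ccmp w1, #5, #0, eq   // if eq: compare b with 5; else force NZCV=0000 so EQ fails
//   b.eq taken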
3359
3360/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3361static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
3362 ISD::CondCode CC, SDValue CCOp,
3363 AArch64CC::CondCode Predicate,
3364 AArch64CC::CondCode OutCC,
3365 const SDLoc &DL, SelectionDAG &DAG) {
3366 unsigned Opcode = 0;
3367 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3368
3369 if (LHS.getValueType().isFloatingPoint()) {
3370 assert(LHS.getValueType() != MVT::f128);
3371 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3372 LHS.getValueType() == MVT::bf16) {
3373 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3374 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3375 }
3376 Opcode = AArch64ISD::FCCMP;
3377 } else if (RHS.getOpcode() == ISD::SUB) {
3378 SDValue SubOp0 = RHS.getOperand(0);
3379 if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
3380 // See emitComparison() on why we can only do this for SETEQ and SETNE.
3381 Opcode = AArch64ISD::CCMN;
3382 RHS = RHS.getOperand(1);
3383 }
3384 }
3385 if (Opcode == 0)
3386 Opcode = AArch64ISD::CCMP;
3387
3388 SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
3389 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
3390 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3391 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3392 return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
3393}
3394
3395/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3396/// expressed as a conjunction. See \ref AArch64CCMP.
3397/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3398/// changing the conditions on the SETCC tests.
3399/// (this means we can call emitConjunctionRec() with
3400/// Negate==true on this sub-tree)
3401/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3402/// cannot do the negation naturally. We are required to
3403/// emit the subtree first in this case.
3404/// \param WillNegate Is true if we are called when the result of this
3405/// subexpression must be negated. This happens when the
3406/// outer expression is an OR. We can use this fact to know
3407/// that we have a double negation (or (or ...) ...) that
3408/// can be implemented for free.
3409static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3410 bool &MustBeFirst, bool WillNegate,
3411 unsigned Depth = 0) {
3412 if (!Val.hasOneUse())
3413 return false;
3414 unsigned Opcode = Val->getOpcode();
3415 if (Opcode == ISD::SETCC) {
3416 if (Val->getOperand(0).getValueType() == MVT::f128)
3417 return false;
3418 CanNegate = true;
3419 MustBeFirst = false;
3420 return true;
3421 }
3422 // Protect against exponential runtime and stack overflow.
3423 if (Depth > 6)
3424 return false;
3425 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3426 bool IsOR = Opcode == ISD::OR;
3427 SDValue O0 = Val->getOperand(0);
3428 SDValue O1 = Val->getOperand(1);
3429 bool CanNegateL;
3430 bool MustBeFirstL;
3431 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3432 return false;
3433 bool CanNegateR;
3434 bool MustBeFirstR;
3435 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3436 return false;
3437
3438 if (MustBeFirstL && MustBeFirstR)
3439 return false;
3440
3441 if (IsOR) {
3442 // For an OR expression we need to be able to naturally negate at least
3443 // one side or we cannot do the transformation at all.
3444 if (!CanNegateL && !CanNegateR)
3445 return false;
3446 // If the result of the OR will be negated and we can naturally negate
3447 // the leaves, then this sub-tree as a whole negates naturally.
3448 CanNegate = WillNegate && CanNegateL && CanNegateR;
3449 // If we cannot naturally negate the whole sub-tree, then this must be
3450 // emitted first.
3451 MustBeFirst = !CanNegate;
3452 } else {
3453 assert(Opcode == ISD::AND && "Must be OR or AND");
3454 // We cannot naturally negate an AND operation.
3455 CanNegate = false;
3456 MustBeFirst = MustBeFirstL || MustBeFirstR;
3457 }
3458 return true;
3459 }
3460 return false;
3461}
3462
3463/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
3464/// of CCMP/FCCMP ops. See @ref AArch64CCMP.
3465/// Tries to transform the given i1 producing node @p Val to a series of compare
3466/// and conditional compare operations. @returns an NZCV flags producing node
3467/// and sets @p OutCC to the flags that should be tested, or returns SDValue() if
3468/// the transformation was not possible.
3469/// \p Negate is true if we want this sub-tree to be negated just by changing
3470/// SETCC conditions.
3471static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
3472 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3473 AArch64CC::CondCode Predicate) {
3474 // We're at a tree leaf, produce a conditional comparison operation.
3475 unsigned Opcode = Val->getOpcode();
3476 if (Opcode == ISD::SETCC) {
3477 SDValue LHS = Val->getOperand(0);
3478 SDValue RHS = Val->getOperand(1);
3479 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3480 bool isInteger = LHS.getValueType().isInteger();
3481 if (Negate)
3482 CC = getSetCCInverse(CC, LHS.getValueType());
3483 SDLoc DL(Val);
3484 // Determine OutCC and handle FP special case.
3485 if (isInteger) {
3486 OutCC = changeIntCCToAArch64CC(CC);
3487 } else {
3488 assert(LHS.getValueType().isFloatingPoint());
3489 AArch64CC::CondCode ExtraCC;
3490 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3491 // Some floating point conditions can't be tested with a single condition
3492 // code. Construct an additional comparison in this case.
3493 if (ExtraCC != AArch64CC::AL) {
3494 SDValue ExtraCmp;
3495 if (!CCOp.getNode())
3496 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3497 else
3498 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3499 ExtraCC, DL, DAG);
3500 CCOp = ExtraCmp;
3501 Predicate = ExtraCC;
3502 }
3503 }
3504
3505 // Produce a normal comparison if we are first in the chain
3506 if (!CCOp)
3507 return emitComparison(LHS, RHS, CC, DL, DAG);
3508 // Otherwise produce a ccmp.
3509 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3510 DAG);
3511 }
3512 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3513
3514 bool IsOR = Opcode == ISD::OR;
3515
3516 SDValue LHS = Val->getOperand(0);
3517 bool CanNegateL;
3518 bool MustBeFirstL;
3519 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3520 assert(ValidL && "Valid conjunction/disjunction tree");
3521 (void)ValidL;
3522
3523 SDValue RHS = Val->getOperand(1);
3524 bool CanNegateR;
3525 bool MustBeFirstR;
3526 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
3527 assert(ValidR && "Valid conjunction/disjunction tree");
3528 (void)ValidR;
3529
3530 // Swap sub-tree that must come first to the right side.
3531 if (MustBeFirstL) {
3532 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3533 std::swap(LHS, RHS);
3534 std::swap(CanNegateL, CanNegateR);
3535 std::swap(MustBeFirstL, MustBeFirstR);
3536 }
3537
3538 bool NegateR;
3539 bool NegateAfterR;
3540 bool NegateL;
3541 bool NegateAfterAll;
3542 if (Opcode == ISD::OR) {
3543 // Swap the sub-tree that we can negate naturally to the left.
3544 if (!CanNegateL) {
3545 assert(CanNegateR && "at least one side must be negatable");
3546 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3547 assert(!Negate);
3548 std::swap(LHS, RHS);
3549 NegateR = false;
3550 NegateAfterR = true;
3551 } else {
3552 // Negate the left sub-tree if possible, otherwise negate the result.
3553 NegateR = CanNegateR;
3554 NegateAfterR = !CanNegateR;
3555 }
3556 NegateL = true;
3557 NegateAfterAll = !Negate;
3558 } else {
3559 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3560 assert(!Negate && "Valid conjunction/disjunction tree");
3561
3562 NegateL = false;
3563 NegateR = false;
3564 NegateAfterR = false;
3565 NegateAfterAll = false;
3566 }
3567
3568 // Emit sub-trees.
3569 AArch64CC::CondCode RHSCC;
3570 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
3571 if (NegateAfterR)
3572 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
3573 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
3574 if (NegateAfterAll)
3575 OutCC = AArch64CC::getInvertedCondCode(OutCC);
3576 return CmpL;
3577}
3578
3579/// Emit expression as a conjunction (a series of CCMP/FCCMP ops).
3580/// In some cases this is even possible with OR operations in the expression.
3581/// See \ref AArch64CCMP.
3582/// \see emitConjunctionRec().
3583static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
3584 AArch64CC::CondCode &OutCC) {
3585 bool DummyCanNegate;
3586 bool DummyMustBeFirst;
3587 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
3588 return SDValue();
3589
3590 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
3591}
3592
3593/// @}
3594
3595/// Returns how profitable it is to fold a comparison's operand's shift and/or
3596/// extension operations.
3597static unsigned getCmpOperandFoldingProfit(SDValue Op) {
3598 auto isSupportedExtend = [&](SDValue V) {
3599 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
3600 return true;
3601
3602 if (V.getOpcode() == ISD::AND)
3603 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
3604 uint64_t Mask = MaskCst->getZExtValue();
3605 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
3606 }
3607
3608 return false;
3609 };
3610
3611 if (!Op.hasOneUse())
3612 return 0;
3613
3614 if (isSupportedExtend(Op))
3615 return 1;
3616
3617 unsigned Opc = Op.getOpcode();
3618 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
3619 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3620 uint64_t Shift = ShiftCst->getZExtValue();
3621 if (isSupportedExtend(Op.getOperand(0)))
3622 return (Shift <= 4) ? 2 : 1;
3623 EVT VT = Op.getValueType();
3624 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
3625 return 1;
3626 }
3627
3628 return 0;
3629}
3630
3631static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3632 SDValue &AArch64cc, SelectionDAG &DAG,
3633 const SDLoc &dl) {
3634 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
3635 EVT VT = RHS.getValueType();
3636 uint64_t C = RHSC->getZExtValue();
3637 if (!isLegalArithImmed(C)) {
3638 // Constant does not fit, try adjusting it by one?
3639 switch (CC) {
3640 default:
3641 break;
3642 case ISD::SETLT:
3643 case ISD::SETGE:
3644 if ((VT == MVT::i32 && C != 0x80000000 &&
3645 isLegalArithImmed((uint32_t)(C - 1))) ||
3646 (VT == MVT::i64 && C != 0x80000000ULL &&
3647 isLegalArithImmed(C - 1ULL))) {
3648 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
3649 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3650 RHS = DAG.getConstant(C, dl, VT);
3651 }
3652 break;
3653 case ISD::SETULT:
3654 case ISD::SETUGE:
3655 if ((VT == MVT::i32 && C != 0 &&
3656 isLegalArithImmed((uint32_t)(C - 1))) ||
3657 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
3658 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
3659 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3660 RHS = DAG.getConstant(C, dl, VT);
3661 }
3662 break;
3663 case ISD::SETLE:
3664 case ISD::SETGT:
3665 if ((VT == MVT::i32 && C != INT32_MAX &&
3666 isLegalArithImmed((uint32_t)(C + 1))) ||
3667 (VT == MVT::i64 && C != INT64_MAX &&
3668 isLegalArithImmed(C + 1ULL))) {
3669 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
3670 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3671 RHS = DAG.getConstant(C, dl, VT);
3672 }
3673 break;
3674 case ISD::SETULE:
3675 case ISD::SETUGT:
3676 if ((VT == MVT::i32 && C != UINT32_MAX &&
3677 isLegalArithImmed((uint32_t)(C + 1))) ||
3678 (VT == MVT::i64 && C != UINT64_MAX &&
3679 isLegalArithImmed(C + 1ULL))) {
3680 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
3681 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3682 RHS = DAG.getConstant(C, dl, VT);
3683 }
3684 break;
3685 }
3686 }
3687 }
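// For illustration: for "x s< 0x1001" the constant 0x1001 is not a legal
// arithmetic immediate, so the code above instead emits "x s<= 0x1000",
// which is encodable (a 12-bit value shifted left by 12).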
3688
3689 // Comparisons are canonicalized so that the RHS operand is simpler than the
3690 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
3691 // can fold some shift+extend operations on the RHS operand, so swap the
3692 // operands if that can be done.
3693 //
3694 // For example:
3695 // lsl w13, w11, #1
3696 // cmp w13, w12
3697 // can be turned into:
3698 // cmp w12, w11, lsl #1
3699 if (!isa<ConstantSDNode>(RHS) || !isLegalArithImmed(RHS->getAsZExtVal())) {
3700 SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
3701
3702 if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
3703 std::swap(LHS, RHS);
3704 CC = ISD::getSetCCSwappedOperands(CC);
3705 }
3706 }
3707
3708 SDValue Cmp;
3709 AArch64CC::CondCode AArch64CC;
3710 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
3711 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
3712
3713 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
3714 // For the i8 operand, the largest immediate is 255, so this can be easily
3715 // encoded in the compare instruction. For the i16 operand, however, the
3716 // largest immediate cannot be encoded in the compare.
3717 // Therefore, use a sign extending load and cmn to avoid materializing the
3718 // -1 constant. For example,
3719 // movz w1, #65535
3720 // ldrh w0, [x0, #0]
3721 // cmp w0, w1
3722 // >
3723 // ldrsh w0, [x0, #0]
3724 // cmn w0, #1
3725 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
3726 // if and only if (sext LHS) == (sext RHS). The checks are in place to
3727 // ensure both the LHS and RHS are truly zero extended and to make sure the
3728 // transformation is profitable.
3729 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
3730 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
3731 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
3732 LHS.getNode()->hasNUsesOfValue(1, 0)) {
3733 int16_t ValueofRHS = RHS->getAsZExtVal();
3734 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
3735 SDValue SExt =
3736 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
3737 DAG.getValueType(MVT::i16));
3738 Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
3739 RHS.getValueType()),
3740 CC, dl, DAG);
3741 AArch64CC = changeIntCCToAArch64CC(CC);
3742 }
3743 }
3744
3745 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
3746 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
3747 if ((CC == ISD::SETNE) ^ RHSC->isZero())
3748 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
3749 }
3750 }
3751 }
3752
3753 if (!Cmp) {
3754 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
3755 AArch64CC = changeIntCCToAArch64CC(CC);
3756 }
3757 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
3758 return Cmp;
3759}
3760
3761static std::pair<SDValue, SDValue>
3762getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
3763 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
3764 "Unsupported value type");
3765 SDValue Value, Overflow;
3766 SDLoc DL(Op);
3767 SDValue LHS = Op.getOperand(0);
3768 SDValue RHS = Op.getOperand(1);
3769 unsigned Opc = 0;
3770 switch (Op.getOpcode()) {
3771 default:
3772 llvm_unreachable("Unknown overflow instruction!");
3773 case ISD::SADDO:
3774 Opc = AArch64ISD::ADDS;
3775 CC = AArch64CC::VS;
3776 break;
3777 case ISD::UADDO:
3778 Opc = AArch64ISD::ADDS;
3779 CC = AArch64CC::HS;
3780 break;
3781 case ISD::SSUBO:
3782 Opc = AArch64ISD::SUBS;
3783 CC = AArch64CC::VS;
3784 break;
3785 case ISD::USUBO:
3786 Opc = AArch64ISD::SUBS;
3787 CC = AArch64CC::LO;
3788 break;
3789 // Multiply needs a little bit extra work.
3790 case ISD::SMULO:
3791 case ISD::UMULO: {
3792 CC = AArch64CC::NE;
3793 bool IsSigned = Op.getOpcode() == ISD::SMULO;
3794 if (Op.getValueType() == MVT::i32) {
3795 // Extend to 64-bits, then perform a 64-bit multiply.
3796 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3797 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
3798 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
3799 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3800 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
3801
3802 // Check that the result fits into a 32-bit integer.
3803 SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
3804 if (IsSigned) {
3805 // cmp xreg, wreg, sxtw
3806 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
3807 Overflow =
3808 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
3809 } else {
3810 // tst xreg, #0xffffffff00000000
3811 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
3812 Overflow =
3813 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
3814 }
3815 break;
3816 }
3817 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
3818 // For the 64 bit multiply
3819 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3820 if (IsSigned) {
3821 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
3822 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
3823 DAG.getConstant(63, DL, MVT::i64));
3824 // It is important that LowerBits is last, otherwise the arithmetic
3825 // shift will not be folded into the compare (SUBS).
3826 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3827 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
3828 .getValue(1);
3829 } else {
3830 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
3831 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3832 Overflow =
3833 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
3834 DAG.getConstant(0, DL, MVT::i64),
3835 UpperBits).getValue(1);
3836 }
3837 break;
3838 }
3839 } // switch (...)
3840
3841 if (Opc) {
3842 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
3843
3844 // Emit the AArch64 operation with overflow check.
3845 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
3846 Overflow = Value.getValue(1);
3847 }
3848 return std::make_pair(Value, Overflow);
3849}
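// Worked example for the signed 64-bit multiply-overflow check above: with
// LHS = 2^62 and RHS = 4 the low 64 bits of the product are 0, so shifting the
// low half right arithmetically by 63 gives 0, while MULHS returns 1. The SUBS
// of the two values is non-zero, so the NE condition correctly reports overflow.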
3850
3851SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
3852 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
3853 !Subtarget->isNeonAvailable()))
3854 return LowerToScalableOp(Op, DAG);
3855
3856 SDValue Sel = Op.getOperand(0);
3857 SDValue Other = Op.getOperand(1);
3858 SDLoc dl(Sel);
3859
3860 // If the operand is an overflow checking operation, invert the condition
3861 // code and kill the Not operation. I.e., transform:
3862 // (xor (overflow_op_bool, 1))
3863 // -->
3864 // (csel 1, 0, invert(cc), overflow_op_bool)
3865 // ... which later gets transformed to just a cset instruction with an
3866 // inverted condition code, rather than a cset + eor sequence.
3867 if (isOverflowIntrOpRes(Sel)) {
3868 // Only lower legal XALUO ops.
3869 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
3870 return SDValue();
3871
3872 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3873 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3874 AArch64CC::CondCode CC;
3875 SDValue Value, Overflow;
3876 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
3877 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3878 return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
3879 CCVal, Overflow);
3880 }
3881 // If neither operand is a SELECT_CC, give up.
3882 if (Sel.getOpcode() != ISD::SELECT_CC)
3883 std::swap(Sel, Other);
3884 if (Sel.getOpcode() != ISD::SELECT_CC)
3885 return Op;
3886
3887 // The folding we want to perform is:
3888 // (xor x, (select_cc a, b, cc, 0, -1) )
3889 // -->
3890 // (csel x, (xor x, -1), cc ...)
3891 //
3892 // The latter will get matched to a CSINV instruction.
3893
3894 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
3895 SDValue LHS = Sel.getOperand(0);
3896 SDValue RHS = Sel.getOperand(1);
3897 SDValue TVal = Sel.getOperand(2);
3898 SDValue FVal = Sel.getOperand(3);
3899
3900 // FIXME: This could be generalized to non-integer comparisons.
3901 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
3902 return Op;
3903
3904 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
3905 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
3906
3907 // The values aren't constants, this isn't the pattern we're looking for.
3908 if (!CFVal || !CTVal)
3909 return Op;
3910
3911 // We can commute the SELECT_CC by inverting the condition. This
3912 // might be needed to make this fit into a CSINV pattern.
3913 if (CTVal->isAllOnes() && CFVal->isZero()) {
3914 std::swap(TVal, FVal);
3915 std::swap(CTVal, CFVal);
3916 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
3917 }
3918
3919 // If the constants line up, perform the transform!
3920 if (CTVal->isZero() && CFVal->isAllOnes()) {
3921 SDValue CCVal;
3922 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
3923
3924 FVal = Other;
3925 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
3926 DAG.getConstant(-1ULL, dl, Other.getValueType()));
3927
3928 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
3929 CCVal, Cmp);
3930 }
3931
3932 return Op;
3933}
3934
3935// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
3936// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
3937// sets 'C' bit to 0.
3938static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
3939 SDLoc DL(Value);
3940 EVT VT = Value.getValueType();
3941 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
3942 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
3943 SDValue Cmp =
3944 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
3945 return Cmp.getValue(1);
3946}
3947
3948// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
3949// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
3950static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG,
3951 bool Invert) {
3952 assert(Glue.getResNo() == 1);
3953 SDLoc DL(Glue);
3954 SDValue Zero = DAG.getConstant(0, DL, VT);
3955 SDValue One = DAG.getConstant(1, DL, VT);
3956 unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
3957 SDValue CC = DAG.getConstant(Cond, DL, MVT::i32);
3958 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
3959}
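// Note on the Invert flag above: AArch64 subtraction sets C to 1 when *no*
// borrow occurs, so a borrow-out (as USUBO produces, see AArch64CC::LO above)
// corresponds to C == 0. The carry lowering below therefore passes Invert=true
// when emitting SBCS-based subtract-with-borrow sequences.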
3960
3961// Value is 1 if 'V' bit of NZCV is 1, else 0
3962static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) {
3963 assert(Glue.getResNo() == 1);
3964 SDLoc DL(Glue);
3965 SDValue Zero = DAG.getConstant(0, DL, VT);
3966 SDValue One = DAG.getConstant(1, DL, VT);
3967 SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32);
3968 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
3969}
3970
3971// This lowering is inefficient, but it will get cleaned up by
3972// `foldOverflowCheck`
3973static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
3974 unsigned Opcode, bool IsSigned) {
3975 EVT VT0 = Op.getValue(0).getValueType();
3976 EVT VT1 = Op.getValue(1).getValueType();
3977
3978 if (VT0 != MVT::i32 && VT0 != MVT::i64)
3979 return SDValue();
3980
3981 bool InvertCarry = Opcode == AArch64ISD::SBCS;
3982 SDValue OpLHS = Op.getOperand(0);
3983 SDValue OpRHS = Op.getOperand(1);
3984 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
3985
3986 SDLoc DL(Op);
3987 SDVTList VTs = DAG.getVTList(VT0, VT1);
3988
3989 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
3990 OpRHS, OpCarryIn);
3991
3992 SDValue OutFlag =
3993 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
3994 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
3995
3996 return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag);
3997}
3998
3999static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
4000 // Let legalize expand this if it isn't a legal type yet.
4001 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4002 return SDValue();
4003
4004 SDLoc dl(Op);
4005 AArch64CC::CondCode CC;
4006 // The actual operation that sets the overflow or carry flag.
4007 SDValue Value, Overflow;
4008 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
4009
4010 // We use 0 and 1 as false and true values.
4011 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4012 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4013
4014 // We use an inverted condition, because the conditional select is inverted
4015 // too. This will allow it to be selected to a single instruction:
4016 // CSINC Wd, WZR, WZR, invert(cond).
4017 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
4018 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
4019 CCVal, Overflow);
4020
4021 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
4022 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4023}
4024
4025// Prefetch operands are:
4026// 1: Address to prefetch
4027// 2: bool isWrite
4028// 3: int locality (0 = no locality ... 3 = extreme locality)
4029// 4: bool isDataCache
4030static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
4031 SDLoc DL(Op);
4032 unsigned IsWrite = Op.getConstantOperandVal(2);
4033 unsigned Locality = Op.getConstantOperandVal(3);
4034 unsigned IsData = Op.getConstantOperandVal(4);
4035
4036 bool IsStream = !Locality;
4037 // When the locality number is set
4038 if (Locality) {
4039 // The front-end should have filtered out the out-of-range values
4040 assert(Locality <= 3 && "Prefetch locality out-of-range");
4041 // The locality degree is the opposite of the cache speed.
4042 // Put the number the other way around.
4043 // The encoding starts at 0 for level 1
4044 Locality = 3 - Locality;
4045 }
4046
4047 // Build the mask value encoding the expected behavior.
4048 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4049 (!IsData << 3) | // IsDataCache bit
4050 (Locality << 1) | // Cache level bits
4051 (unsigned)IsStream; // Stream bit
4052 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
4053 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
4054 Op.getOperand(1));
4055}
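// For illustration, the PrfOp value built above matches the PRFM prfop
// encoding: a data read with maximal locality (IsWrite=0, IsData=1,
// Locality=3) yields PrfOp = 0b00000, i.e. PLDL1KEEP, while a streaming data
// write (IsWrite=1, Locality=0) yields 0b10001, i.e. PSTL1STRM.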
4056
4057SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4058 SelectionDAG &DAG) const {
4059 EVT VT = Op.getValueType();
4060 if (VT.isScalableVector())
4061 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4062
4063 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4064 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4065
4066 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4067 return SDValue();
4068}
4069
4070SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4071 SelectionDAG &DAG) const {
4072 EVT VT = Op.getValueType();
4073 if (VT.isScalableVector())
4074 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4075
4076 bool IsStrict = Op->isStrictFPOpcode();
4077 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4078 EVT SrcVT = SrcVal.getValueType();
4079 bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
4080
4081 if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
4082 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4083
4084 // Expand cases where the result type is BF16 but we don't have hardware
4085 // instructions to lower it.
4086 if (VT.getScalarType() == MVT::bf16 &&
4087 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4088 Subtarget->hasBF16())) {
4089 SDLoc dl(Op);
4090 SDValue Narrow = SrcVal;
4091 SDValue NaN;
4092 EVT I32 = SrcVT.changeElementType(MVT::i32);
4093 EVT F32 = SrcVT.changeElementType(MVT::f32);
4094 if (SrcVT.getScalarType() == MVT::f32) {
4095 bool NeverSNaN = DAG.isKnownNeverSNaN(Narrow);
4096 Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4097 if (!NeverSNaN) {
4098 // Set the quiet bit.
4099 NaN = DAG.getNode(ISD::OR, dl, I32, Narrow,
4100 DAG.getConstant(0x400000, dl, I32));
4101 }
4102 } else if (SrcVT.getScalarType() == MVT::f64) {
4103 Narrow = DAG.getNode(AArch64ISD::FCVTXN, dl, F32, Narrow);
4104 Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4105 } else {
4106 return SDValue();
4107 }
4108 if (!Trunc) {
4109 SDValue One = DAG.getConstant(1, dl, I32);
4110 SDValue Lsb = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4111 DAG.getShiftAmountConstant(16, I32, dl));
4112 Lsb = DAG.getNode(ISD::AND, dl, I32, Lsb, One);
4113 SDValue RoundingBias =
4114 DAG.getNode(ISD::ADD, dl, I32, DAG.getConstant(0x7fff, dl, I32), Lsb);
4115 Narrow = DAG.getNode(ISD::ADD, dl, I32, Narrow, RoundingBias);
4116 }
4117
4118 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4119 // 0x80000000.
4120 if (NaN) {
4121 SDValue IsNaN = DAG.getSetCC(
4122 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT),
4123 SrcVal, SrcVal, ISD::SETUO);
4124 Narrow = DAG.getSelect(dl, I32, IsNaN, NaN, Narrow);
4125 }
4126
4127 // Now that we have rounded, shift the bits into position.
4128 Narrow = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4129 DAG.getShiftAmountConstant(16, I32, dl));
4130 if (VT.isVector()) {
4131 EVT I16 = I32.changeVectorElementType(MVT::i16);
4132 Narrow = DAG.getNode(ISD::TRUNCATE, dl, I16, Narrow);
4133 return DAG.getNode(ISD::BITCAST, dl, VT, Narrow);
4134 }
4135 Narrow = DAG.getNode(ISD::BITCAST, dl, F32, Narrow);
4136 SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, dl, VT, Narrow);
4137 return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, dl)
4138 : Result;
4139 }
4140
4141 if (SrcVT != MVT::f128) {
4142 // Expand cases where the input is a vector bigger than NEON.
4144 return SDValue();
4145
4146 // It's legal except when f128 is involved
4147 return Op;
4148 }
4149
4150 return SDValue();
4151}
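// Worked example of the round-to-nearest-even bias used above for f32 -> bf16:
// the f32 bit pattern 0x3F808000 lies exactly halfway between two bf16 values;
// its bit 16 is 0, so the bias is 0x7FFF, the sum is 0x3F80FFFF, and the shift
// yields 0x3F80. For 0x3F818000 the bit is 1, the bias is 0x8000 and the result
// is 0x3F82; in both cases the tie is broken towards the value whose low
// mantissa bit is zero.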
4152
4153SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4154 SelectionDAG &DAG) const {
4155 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4156 // Any additional optimization in this function should be recorded
4157 // in the cost tables.
4158 bool IsStrict = Op->isStrictFPOpcode();
4159 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
4160 EVT VT = Op.getValueType();
4161
4162 if (VT.isScalableVector()) {
4163 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4164 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4165 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4166 return LowerToPredicatedOp(Op, DAG, Opcode);
4167 }
4168
4169 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4170 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4171 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4172
4173 unsigned NumElts = InVT.getVectorNumElements();
4174
4175 // f16 conversions are promoted to f32 when full fp16 is not supported.
4176 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4177 InVT.getVectorElementType() == MVT::bf16) {
4178 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
4179 SDLoc dl(Op);
4180 if (IsStrict) {
4181 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
4182 {Op.getOperand(0), Op.getOperand(1)});
4183 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4184 {Ext.getValue(1), Ext.getValue(0)});
4185 }
4186 return DAG.getNode(
4187 Op.getOpcode(), dl, Op.getValueType(),
4188 DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
4189 }
4190
4191 uint64_t VTSize = VT.getFixedSizeInBits();
4192 uint64_t InVTSize = InVT.getFixedSizeInBits();
4193 if (VTSize < InVTSize) {
4194 SDLoc dl(Op);
4195 if (IsStrict) {
4196 InVT = InVT.changeVectorElementTypeToInteger();
4197 SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other},
4198 {Op.getOperand(0), Op.getOperand(1)});
4199 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4200 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl);
4201 }
4202 SDValue Cv =
4203 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
4204 Op.getOperand(0));
4205 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4206 }
4207
4208 if (VTSize > InVTSize) {
4209 SDLoc dl(Op);
4210 MVT ExtVT =
4213 if (IsStrict) {
4214 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other},
4215 {Op.getOperand(0), Op.getOperand(1)});
4216 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4217 {Ext.getValue(1), Ext.getValue(0)});
4218 }
4219 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
4220 return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
4221 }
4222
4223 // Use a scalar operation for conversions between single-element vectors of
4224 // the same size.
4225 if (NumElts == 1) {
4226 SDLoc dl(Op);
4227 SDValue Extract = DAG.getNode(
4229 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64));
4230 EVT ScalarVT = VT.getScalarType();
4231 if (IsStrict)
4232 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4233 {Op.getOperand(0), Extract});
4234 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4235 }
4236
4237 // Type changing conversions are illegal.
4238 return Op;
4239}
4240
4241SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4242 SelectionDAG &DAG) const {
4243 bool IsStrict = Op->isStrictFPOpcode();
4244 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4245
4246 if (SrcVal.getValueType().isVector())
4247 return LowerVectorFP_TO_INT(Op, DAG);
4248
4249 // f16 conversions are promoted to f32 when full fp16 is not supported.
4250 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4251 SrcVal.getValueType() == MVT::bf16) {
4252 SDLoc dl(Op);
4253 if (IsStrict) {
4254 SDValue Ext =
4255 DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
4256 {Op.getOperand(0), SrcVal});
4257 return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other},
4258 {Ext.getValue(1), Ext.getValue(0)});
4259 }
4260 return DAG.getNode(
4261 Op.getOpcode(), dl, Op.getValueType(),
4262 DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
4263 }
4264
4265 if (SrcVal.getValueType() != MVT::f128) {
4266 // It's legal except when f128 is involved
4267 return Op;
4268 }
4269
4270 return SDValue();
4271}
4272
4273SDValue
4274AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4275 SelectionDAG &DAG) const {
4276 // AArch64 FP-to-int conversions saturate to the destination element size, so
4277 // we can lower common saturating conversions to simple instructions.
4278 SDValue SrcVal = Op.getOperand(0);
4279 EVT SrcVT = SrcVal.getValueType();
4280 EVT DstVT = Op.getValueType();
4281 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4282
4283 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4284 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4285 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4286 assert(SatWidth <= DstElementWidth &&
4287 "Saturation width cannot exceed result width");
4288
4289 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4290 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4291 // types, so this is hard to reach.
4292 if (DstVT.isScalableVector())
4293 return SDValue();
4294
4295 EVT SrcElementVT = SrcVT.getVectorElementType();
4296
4297 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4298 if ((SrcElementVT == MVT::f16 &&
4299 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
4300 SrcElementVT == MVT::bf16) {
4301 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
4302 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), F32VT, SrcVal);
4303 SrcVT = F32VT;
4304 SrcElementVT = MVT::f32;
4305 SrcElementWidth = 32;
4306 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4307 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
4308 return SDValue();
4309
4310 SDLoc DL(Op);
4311 // Cases that we can emit directly.
4312 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth)
4313 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4314 DAG.getValueType(DstVT.getScalarType()));
4315
4316 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4317 // result. This is only valid if the legal cvt is larger than the saturate
4318 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
4319 // (at least until sqxtn is selected).
4320 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4321 return SDValue();
4322
4323 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4324 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
4325 DAG.getValueType(IntVT.getScalarType()));
4326 SDValue Sat;
4327 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4328 SDValue MinC = DAG.getConstant(
4329 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4330 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
4331 SDValue MaxC = DAG.getConstant(
4332 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4333 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
4334 } else {
4335 SDValue MinC = DAG.getConstant(
4336 APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
4337 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
4338 }
4339
4340 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4341}
4342
4343SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
4344 SelectionDAG &DAG) const {
4345 // AArch64 FP-to-int conversions saturate to the destination register size, so
4346 // we can lower common saturating conversions to simple instructions.
4347 SDValue SrcVal = Op.getOperand(0);
4348 EVT SrcVT = SrcVal.getValueType();
4349
4350 if (SrcVT.isVector())
4351 return LowerVectorFP_TO_INT_SAT(Op, DAG);
4352
4353 EVT DstVT = Op.getValueType();
4354 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4355 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4356 uint64_t DstWidth = DstVT.getScalarSizeInBits();
4357 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
4358
4359 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4360 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
4361 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
4362 SrcVT = MVT::f32;
4363 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
4364 SrcVT != MVT::bf16)
4365 return SDValue();
4366
4367 SDLoc DL(Op);
4368 // Cases that we can emit directly.
4369 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
4370 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
4371 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
4372 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4373 DAG.getValueType(DstVT));
4374
4375 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4376 // result. This is only valid if the legal cvt is larger than the saturate
4377 // width.
4378 if (DstWidth < SatWidth)
4379 return SDValue();
4380
4381 SDValue NativeCvt =
4382 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
4383 SDValue Sat;
4384 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4385 SDValue MinC = DAG.getConstant(
4386 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
4387 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
4388 SDValue MaxC = DAG.getConstant(
4389 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
4390 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
4391 } else {
4392 SDValue MinC = DAG.getConstant(
4393 APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
4394 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
4395 }
4396
4397 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4398}
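// For illustration: a saturating f32 -> i8 conversion (whose i8 result has
// typically been promoted to i32 by type legalization, leaving SatWidth = 8
// and DstWidth = 32) is emitted above as a single f32 -> i32 fcvtzs followed
// by an SMIN with 127 and an SMAX with -128 to clamp the value to the i8 range.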
4399
4400SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
4401 SelectionDAG &DAG) const {
4402 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4403 // Any additional optimization in this function should be recorded
4404 // in the cost tables.
4405 bool IsStrict = Op->isStrictFPOpcode();
4406 EVT VT = Op.getValueType();
4407 SDLoc dl(Op);
4408 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
4409 EVT InVT = In.getValueType();
4410 unsigned Opc = Op.getOpcode();
4411 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
4412
4413 if (VT.isScalableVector()) {
4414 if (InVT.getVectorElementType() == MVT::i1) {
4415 // We can't directly extend an SVE predicate; extend it first.
4416 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4417 EVT CastVT = getPromotedVTForPredicate(InVT);
4418 In = DAG.getNode(CastOpc, dl, CastVT, In);
4419 return DAG.getNode(Opc, dl, VT, In);
4420 }
4421
4422 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
4423 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
4424 return LowerToPredicatedOp(Op, DAG, Opcode);
4425 }
4426
4427 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4428 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4429 return LowerFixedLengthIntToFPToSVE(Op, DAG);
4430
4431 // Promote bf16 conversions to f32.
4432 if (VT.getVectorElementType() == MVT::bf16) {
4433 EVT F32 = VT.changeElementType(MVT::f32);
4434 if (IsStrict) {
4435 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {F32, MVT::Other},
4436 {Op.getOperand(0), In});
4437 return DAG.getNode(
4438 ISD::STRICT_FP_ROUND, dl, {Op.getValueType(), MVT::Other},
4439 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4440 }
4441 return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
4442 DAG.getNode(Op.getOpcode(), dl, F32, In),
4443 DAG.getIntPtrConstant(0, dl));
4444 }
4445
4446 uint64_t VTSize = VT.getFixedSizeInBits();
4447 uint64_t InVTSize = InVT.getFixedSizeInBits();
4448 if (VTSize < InVTSize) {
4449 MVT CastVT =
4450 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
4451 InVT.getVectorNumElements());
4452 if (IsStrict) {
4453 In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
4454 {Op.getOperand(0), In});
4455 return DAG.getNode(
4456 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
4457 {In.getValue(1), In.getValue(0), DAG.getIntPtrConstant(0, dl)});
4458 }
4459 In = DAG.getNode(Opc, dl, CastVT, In);
4460 return DAG.getNode(ISD::FP_ROUND, dl, VT, In,
4461 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
4462 }
4463
4464 if (VTSize > InVTSize) {
4465 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4466 EVT CastVT = VT.changeVectorElementTypeToInteger();
4467 In = DAG.getNode(CastOpc, dl, CastVT, In);
4468 if (IsStrict)
4469 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In});
4470 return DAG.getNode(Opc, dl, VT, In);
4471 }
4472
4473 // Use a scalar operation for conversions between single-element vectors of
4474 // the same size.
4475 if (VT.getVectorNumElements() == 1) {
4476 SDValue Extract = DAG.getNode(
4477 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4478 In, DAG.getConstant(0, dl, MVT::i64));
4479 EVT ScalarVT = VT.getScalarType();
4480 if (IsStrict)
4481 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4482 {Op.getOperand(0), Extract});
4483 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4484 }
4485
4486 return Op;
4487}
4488
4489SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
4490 SelectionDAG &DAG) const {
4491 if (Op.getValueType().isVector())
4492 return LowerVectorINT_TO_FP(Op, DAG);
4493
4494 bool IsStrict = Op->isStrictFPOpcode();
4495 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4496
4497 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
4498 Op->getOpcode() == ISD::SINT_TO_FP;
4499
4500 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
4501 SDLoc dl(Op);
4502 if (IsStrict) {
4503 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {PromoteVT, MVT::Other},
4504 {Op.getOperand(0), SrcVal});
4505 return DAG.getNode(
4506 ISD::STRICT_FP_ROUND, dl, {Op.getValueType(), MVT::Other},
4507 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4508 }
4509 return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
4510 DAG.getNode(Op.getOpcode(), dl, PromoteVT, SrcVal),
4511 DAG.getIntPtrConstant(0, dl));
4512 };
4513
4514 if (Op.getValueType() == MVT::bf16) {
4515 unsigned MaxWidth = IsSigned
4516 ? DAG.ComputeMaxSignificantBits(SrcVal)
4517 : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
4518 // bf16 conversions are promoted to f32 when converting from i16.
4519 if (MaxWidth <= 24) {
4520 return IntToFpViaPromotion(MVT::f32);
4521 }
4522
4523 // bf16 conversions are promoted to f64 when converting from i32.
4524 if (MaxWidth <= 53) {
4525 return IntToFpViaPromotion(MVT::f64);
4526 }
4527
4528 // We need to be careful about i64 -> bf16.
4529 // Consider an i32 22216703.
4530 // This number cannot be represented exactly as an f32 and so an itofp will
4531 // turn it into 22216704.0; fptrunc to bf16 will then turn this into 22282240.0.
4532 // However, the correct bf16 was supposed to be 22151168.0.
4533 // We need to use sticky rounding to get this correct.
4534 if (SrcVal.getValueType() == MVT::i64) {
4535 SDLoc DL(Op);
4536 // This algorithm is equivalent to the following:
4537 // uint64_t SrcHi = SrcVal & ~0xfffull;
4538 // uint64_t SrcLo = SrcVal & 0xfffull;
4539 // uint64_t Highest = SrcVal >> 53;
4540 // bool HasHighest = Highest != 0;
4541 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
4542 // double Rounded = static_cast<double>(ToRound);
4543 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
4544 // uint64_t HasLo = SrcLo != 0;
4545 // bool NeedsAdjustment = HasHighest & HasLo;
4546 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
4547 // double Adjusted = std::bit_cast<double>(AdjustedBits);
4548 // return static_cast<__bf16>(Adjusted);
4549 //
4550 // Essentially, what happens is that SrcVal either fits perfectly in a
4551 // double-precision value or it is too big. If it is sufficiently small,
4552 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
4553 // ensure that u64 -> double has no rounding error by only using the 52
4554 // MSB of the input. The low order bits will get merged into a sticky bit
4555 // which will avoid issues incurred by double rounding.
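// Intuition (illustrative): once SrcVal needs more than 53 significant bits,
// the low 12 bits cannot be represented in the f64 mantissa anyway; all the
// final f64 -> bf16 rounding needs to know is whether any of them were set.
// OR-ing their disjunction into the LSB of the rounded f64 bit pattern keeps
// exactly that information, i.e. it acts as a classic sticky bit and avoids
// double rounding.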
4556
4557 // Signed conversion is more or less like so:
4558 // copysign((__bf16)abs(SrcVal), SrcVal)
4559 SDValue SignBit;
4560 if (IsSigned) {
4561 SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4562 DAG.getConstant(1ull << 63, DL, MVT::i64));
4563 SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
4564 }
4565 SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4566 DAG.getConstant(~0xfffull, DL, MVT::i64));
4567 SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4568 DAG.getConstant(0xfffull, DL, MVT::i64));
4569 SDValue Highest =
4570 DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
4571 DAG.getShiftAmountConstant(53, MVT::i64, DL));
4572 SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
4573 SDValue ToRound =
4574 DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
4575 SDValue Rounded =
4576 IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
4577 {Op.getOperand(0), ToRound})
4578 : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
4579
4580 SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
4581 if (SignBit) {
4582 RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
4583 }
4584
4585 SDValue HasHighest = DAG.getSetCC(
4586 DL,
4587 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
4588 Highest, Zero64, ISD::SETNE);
4589
4590 SDValue HasLo = DAG.getSetCC(
4591 DL,
4592 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
4593 SrcLo, Zero64, ISD::SETNE);
4594
4595 SDValue NeedsAdjustment =
4596 DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
4597 NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
4598
4599 SDValue AdjustedBits =
4600 DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
4601 SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
4602 return IsStrict
4603 ? DAG.getNode(ISD::STRICT_FP_ROUND, DL,
4604 {Op.getValueType(), MVT::Other},
4605 {Rounded.getValue(1), Adjusted,
4606 DAG.getIntPtrConstant(0, DL)})
4607 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
4608 DAG.getIntPtrConstant(0, DL, true));
4609 }
4610 }
4611
4612 // f16 conversions are promoted to f32 when full fp16 is not supported.
4613 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
4614 return IntToFpViaPromotion(MVT::f32);
4615 }
4616
4617 // i128 conversions are libcalls.
4618 if (SrcVal.getValueType() == MVT::i128)
4619 return SDValue();
4620
4621 // Other conversions are legal, unless it's to the completely software-based
4622 // fp128.
4623 if (Op.getValueType() != MVT::f128)
4624 return Op;
4625 return SDValue();
4626}
4627
4628SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
4629 SelectionDAG &DAG) const {
4630 // For iOS, we want to call an alternative entry point: __sincos_stret,
4631 // which returns the values in two S / D registers.
4632 SDLoc dl(Op);
4633 SDValue Arg = Op.getOperand(0);
4634 EVT ArgVT = Arg.getValueType();
4635 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
4636
4637 ArgListTy Args;
4638 ArgListEntry Entry;
4639
4640 Entry.Node = Arg;
4641 Entry.Ty = ArgTy;
4642 Entry.IsSExt = false;
4643 Entry.IsZExt = false;
4644 Args.push_back(Entry);
4645
4646 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
4647 : RTLIB::SINCOS_STRET_F32;
4648 const char *LibcallName = getLibcallName(LC);
4649 SDValue Callee =
4650 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
4651
4652 StructType *RetTy = StructType::get(ArgTy, ArgTy);
4653 TargetLowering::CallLoweringInfo CLI(DAG);
4654 CLI.setDebugLoc(dl)
4655 .setChain(DAG.getEntryNode())
4656 .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
4657
4658 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
4659 return CallResult.first;
4660}
4661
4662static MVT getSVEContainerType(EVT ContentTy);
4663
4664SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
4665 SelectionDAG &DAG) const {
4666 EVT OpVT = Op.getValueType();
4667 EVT ArgVT = Op.getOperand(0).getValueType();
4668
4669 if (useSVEForFixedLengthVectorVT(OpVT, !Subtarget->isNeonAvailable()))
4670 return LowerFixedLengthBitcastToSVE(Op, DAG);
4671
4672 if (OpVT.isScalableVector()) {
4673 // Bitcasting between unpacked vector types of different element counts is
4674 // not a NOP because the live elements are laid out differently.
4675 // 01234567
4676 // e.g. nxv2i32 = XX??XX??
4677 // nxv4f16 = X?X?X?X?
4678 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
4679 return SDValue();
4680
4681 if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) {
4682 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
4683 "Expected int->fp bitcast!");
4684 SDValue ExtResult =
4685 DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
4686 Op.getOperand(0));
4687 return getSVESafeBitCast(OpVT, ExtResult, DAG);
4688 }
4689 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
4690 }
4691
4692 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
4693 return SDValue();
4694
4695 // Bitcasts between f16 and bf16 are legal.
4696 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
4697 return Op;
4698
4699 assert(ArgVT == MVT::i16);
4700 SDLoc DL(Op);
4701
4702 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
4703 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
4704 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
4705}
4706
4707static EVT getExtensionTo64Bits(const EVT &OrigVT) {
4708 if (OrigVT.getSizeInBits() >= 64)
4709 return OrigVT;
4710
4711 assert(OrigVT.isSimple() && "Expecting a simple value type");
4712
4713 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
4714 switch (OrigSimpleTy) {
4715 default: llvm_unreachable("Unexpected Vector Type");
4716 case MVT::v2i8:
4717 case MVT::v2i16:
4718 return MVT::v2i32;
4719 case MVT::v4i8:
4720 return MVT::v4i16;
4721 }
4722}
4723
4723
4724 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
4725 const EVT &OrigTy,
4726 const EVT &ExtTy,
4727 unsigned ExtOpcode) {
4728 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
4729 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
4730 // 64-bits we need to insert a new extension so that it will be 64-bits.
4731 assert(ExtTy.is128BitVector() && "Unexpected extension size");
4732 if (OrigTy.getSizeInBits() >= 64)
4733 return N;
4734
4735 // Must extend size to at least 64 bits to be used as an operand for VMULL.
4736 EVT NewVT = getExtensionTo64Bits(OrigTy);
4737
4738 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
4739}
4740
4741// Returns lane if Op extracts from a two-element vector and lane is constant
4742// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
4743static std::optional<uint64_t>
4744 getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
4745 SDNode *OpNode = Op.getNode();
4746 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
4747 return std::nullopt;
4748
4749 EVT VT = OpNode->getOperand(0).getValueType();
4750 ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
4751 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
4752 return std::nullopt;
4753
4754 return C->getZExtValue();
4755}
4756
4757 static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
4758 bool isSigned) {
4759 EVT VT = N.getValueType();
4760
4761 if (N.getOpcode() != ISD::BUILD_VECTOR)
4762 return false;
4763
4764 for (const SDValue &Elt : N->op_values()) {
4765 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
4766 unsigned EltSize = VT.getScalarSizeInBits();
4767 unsigned HalfSize = EltSize / 2;
4768 if (isSigned) {
4769 if (!isIntN(HalfSize, C->getSExtValue()))
4770 return false;
4771 } else {
4772 if (!isUIntN(HalfSize, C->getZExtValue()))
4773 return false;
4774 }
4775 continue;
4776 }
4777 return false;
4778 }
4779
4780 return true;
4781}
4782
4783 static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
4784 EVT VT = N.getValueType();
4785 assert(VT.is128BitVector() && "Unexpected vector MULL size");
4786
4787 unsigned NumElts = VT.getVectorNumElements();
4788 unsigned OrigEltSize = VT.getScalarSizeInBits();
4789 unsigned EltSize = OrigEltSize / 2;
4790 MVT TruncVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
4791
4792 APInt HiBits = APInt::getHighBitsSet(OrigEltSize, EltSize);
4793 if (DAG.MaskedValueIsZero(N, HiBits))
4794 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), TruncVT, N);
4795
4796 if (ISD::isExtOpcode(N.getOpcode()))
4797 return addRequiredExtensionForVectorMULL(N.getOperand(0), DAG,
4798 N.getOperand(0).getValueType(), VT,
4799 N.getOpcode());
4800
4801 assert(N.getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
4802 SDLoc dl(N);
4803 SmallVector<SDValue, 8> Ops;
4804 for (unsigned i = 0; i != NumElts; ++i) {
4805 const APInt &CInt = N.getConstantOperandAPInt(i);
4806 // Element types smaller than 32 bits are not legal, so use i32 elements.
4807 // The values are implicitly truncated so sext vs. zext doesn't matter.
4808 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
4809 }
4810 return DAG.getBuildVector(TruncVT, dl, Ops);
4811}
4812
4813 static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
4814 return N.getOpcode() == ISD::SIGN_EXTEND ||
4815 N.getOpcode() == ISD::ANY_EXTEND ||
4816 isExtendedBUILD_VECTOR(N, DAG, true);
4817}
4818
4819 static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {
4820 return N.getOpcode() == ISD::ZERO_EXTEND ||
4821 N.getOpcode() == ISD::ANY_EXTEND ||
4822 isExtendedBUILD_VECTOR(N, DAG, false);
4823}
4824
4825 static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
4826 unsigned Opcode = N.getOpcode();
4827 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4828 SDValue N0 = N.getOperand(0);
4829 SDValue N1 = N.getOperand(1);
4830 return N0->hasOneUse() && N1->hasOneUse() &&
4831 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
4832 }
4833 return false;
4834}
4835
4836 static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
4837 unsigned Opcode = N.getOpcode();
4838 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4839 SDValue N0 = N.getOperand(0);
4840 SDValue N1 = N.getOperand(1);
4841 return N0->hasOneUse() && N1->hasOneUse() &&
4842 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
4843 }
4844 return false;
4845}
4846
4847SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
4848 SelectionDAG &DAG) const {
4849 // The rounding mode is in bits 23:22 of the FPCR.
4850 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
4851 // The formula we use to implement this is (((FPCR + (1 << 22)) >> 22) & 3)
4852 // so that the shift + and get folded into a bitfield extract.
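// Worked example (illustrative): with FPCR.RMode == 0b01 (round towards plus
// infinity), the computation below yields ((1 << 22) + (1 << 22)) >> 22 & 3
// == 2, matching the 1 -> 2 entry of the mapping above.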
4853 SDLoc dl(Op);
4854
4855 SDValue Chain = Op.getOperand(0);
4856 SDValue FPCR_64 = DAG.getNode(
4857 ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
4858 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
4859 Chain = FPCR_64.getValue(1);
4860 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
4861 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
4862 DAG.getConstant(1U << 22, dl, MVT::i32));
4863 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
4864 DAG.getConstant(22, dl, MVT::i32));
4865 SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
4866 DAG.getConstant(3, dl, MVT::i32));
4867 return DAG.getMergeValues({AND, Chain}, dl);
4868}
4869
4870SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
4871 SelectionDAG &DAG) const {
4872 SDLoc DL(Op);
4873 SDValue Chain = Op->getOperand(0);
4874 SDValue RMValue = Op->getOperand(1);
4875
4876 // The rounding mode is in bits 23:22 of the FPCR.
4877 // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
4878 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
4879 // ((arg - 1) & 3) << 22).
4880 //
4881 // The argument of llvm.set.rounding must be within the range [0, 3], so
4882 // NearestTiesToAway (4) is not handled here. It is the responsibility of the
4883 // code that generates llvm.set.rounding to ensure this condition.
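// Worked example (illustrative): llvm.set.rounding(2) (round towards positive
// infinity) computes ((2 - 1) & 3) << 22 == 1 << 22, so FPCR.RMode becomes
// 0b01, consistent with the 2 -> 1 mapping described above.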
4884
4885 // Calculate new value of FPCR[23:22].
4886 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
4887 DAG.getConstant(1, DL, MVT::i32));
4888 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
4889 DAG.getConstant(0x3, DL, MVT::i32));
4890 RMValue =
4891 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
4892 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
4893 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
4894
4895 // Get current value of FPCR.
4896 SDValue Ops[] = {
4897 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
4898 SDValue FPCR =
4899 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
4900 Chain = FPCR.getValue(1);
4901 FPCR = FPCR.getValue(0);
4902
4903 // Put the new rounding mode into FPCR[23:22].
4904 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
4905 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
4906 DAG.getConstant(RMMask, DL, MVT::i64));
4907 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
4908 SDValue Ops2[] = {
4909 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
4910 FPCR};
4911 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
4912}
4913
4914static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
4915 SDLoc DL, bool &IsMLA) {
4916 bool IsN0SExt = isSignExtended(N0, DAG);
4917 bool IsN1SExt = isSignExtended(N1, DAG);
4918 if (IsN0SExt && IsN1SExt)
4919 return AArch64ISD::SMULL;
4920
4921 bool IsN0ZExt = isZeroExtended(N0, DAG);
4922 bool IsN1ZExt = isZeroExtended(N1, DAG);
4923
4924 if (IsN0ZExt && IsN1ZExt)
4925 return AArch64ISD::UMULL;
4926
4927 // Select SMULL if we can replace zext with sext.
4928 if (((IsN0SExt && IsN1ZExt) || (IsN0ZExt && IsN1SExt)) &&
4929 !isExtendedBUILD_VECTOR(N0, DAG, false) &&
4930 !isExtendedBUILD_VECTOR(N1, DAG, false)) {
4931 SDValue ZextOperand;
4932 if (IsN0ZExt)
4933 ZextOperand = N0.getOperand(0);
4934 else
4935 ZextOperand = N1.getOperand(0);
4936 if (DAG.SignBitIsZero(ZextOperand)) {
4937 SDValue NewSext =
4938 DAG.getSExtOrTrunc(ZextOperand, DL, N0.getValueType());
4939 if (IsN0ZExt)
4940 N0 = NewSext;
4941 else
4942 N1 = NewSext;
4943 return AArch64ISD::SMULL;
4944 }
4945 }
4946
4947 // Select UMULL if we can replace the other operand with an extend.
4948 if (IsN0ZExt || IsN1ZExt) {
4949 EVT VT = N0.getValueType();
4950 APInt Mask = APInt::getHighBitsSet(VT.getScalarSizeInBits(),
4951 VT.getScalarSizeInBits() / 2);
4952 if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
4953 return AArch64ISD::UMULL;
4954 }
4955
4956 if (!IsN1SExt && !IsN1ZExt)
4957 return 0;
4958
4959 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
4960 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
4961 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
4962 IsMLA = true;
4963 return AArch64ISD::SMULL;
4964 }
4965 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
4966 IsMLA = true;
4967 return AArch64ISD::UMULL;
4968 }
4969 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
4970 std::swap(N0, N1);
4971 IsMLA = true;
4972 return AArch64ISD::UMULL;
4973 }
4974 return 0;
4975}
4976
4977SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
4978 EVT VT = Op.getValueType();
4979
4980 bool OverrideNEON = !Subtarget->isNeonAvailable();
4981 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
4982 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
4983
4984 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
4985 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
4986 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
4987 "unexpected type for custom-lowering ISD::MUL");
4988 SDValue N0 = Op.getOperand(0);
4989 SDValue N1 = Op.getOperand(1);
4990 bool isMLA = false;
4991 EVT OVT = VT;
4992 if (VT.is64BitVector()) {
4993 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4994 isNullConstant(N0.getOperand(1)) &&
4995 N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4996 isNullConstant(N1.getOperand(1))) {
4997 N0 = N0.getOperand(0);
4998 N1 = N1.getOperand(0);
4999 VT = N0.getValueType();
5000 } else {
5001 if (VT == MVT::v1i64) {
5002 if (Subtarget->hasSVE())
5003 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5004 // Fall through to expand this. It is not legal.
5005 return SDValue();
5006 } else
5007 // Other vector multiplications are legal.
5008 return Op;
5009 }
5010 }
5011
5012 SDLoc DL(Op);
5013 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
5014
5015 if (!NewOpc) {
5016 if (VT.getVectorElementType() == MVT::i64) {
5017 // If SVE is available then i64 vector multiplications can also be made
5018 // legal.
5019 if (Subtarget->hasSVE())
5020 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5021 // Fall through to expand this. It is not legal.
5022 return SDValue();
5023 } else
5024 // Other vector multiplications are legal.
5025 return Op;
5026 }
5027
5028 // Legalize to a S/UMULL instruction
5029 SDValue Op0;
5030 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
5031 if (!isMLA) {
5032 Op0 = skipExtensionForVectorMULL(N0, DAG);
5033 assert(Op0.getValueType().is64BitVector() &&
5034 Op1.getValueType().is64BitVector() &&
5035 "unexpected types for extended operands to VMULL");
5036 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
5037 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
5038 DAG.getConstant(0, DL, MVT::i64));
5039 }
5040 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
5041 // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
5042 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
5043 SDValue N00 = skipExtensionForVectorMULL(N0.getOperand(0), DAG);
5044 SDValue N01 = skipExtensionForVectorMULL(N0.getOperand(1), DAG);
5045 EVT Op1VT = Op1.getValueType();
5046 return DAG.getNode(
5047 ISD::EXTRACT_SUBVECTOR, DL, OVT,
5048 DAG.getNode(N0.getOpcode(), DL, VT,
5049 DAG.getNode(NewOpc, DL, VT,
5050 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5051 DAG.getNode(NewOpc, DL, VT,
5052 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5053 DAG.getConstant(0, DL, MVT::i64));
5054}
5055
5056static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5057 int Pattern) {
5058 if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
5059 return DAG.getConstant(1, DL, MVT::nxv1i1);
5060 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
5061 DAG.getTargetConstant(Pattern, DL, MVT::i32));
5062}
5063
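// Try to fold an SVE while-intrinsic with constant operands into a PTRUE with
// a fixed vector-length pattern. Illustrative example (assuming a 128-bit
// minimum SVE vector length): whilelo(4, 8) produces 4 active lanes, so an
// nxv4i1 result can be emitted as ptrue with pattern VL4.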
5064static SDValue optimizeWhile(SDValue Op, SelectionDAG &DAG, bool IsSigned,
5065 bool IsLess, bool IsEqual) {
5066 if (!isa<ConstantSDNode>(Op.getOperand(1)) ||
5067 !isa<ConstantSDNode>(Op.getOperand(2)))
5068 return SDValue();
5069
5070 SDLoc dl(Op);
5071 APInt X = Op.getConstantOperandAPInt(1);
5072 APInt Y = Op.getConstantOperandAPInt(2);
5073 APInt NumActiveElems;
5074 bool Overflow;
5075 if (IsLess)
5076 NumActiveElems = IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
5077 else
5078 NumActiveElems = IsSigned ? X.ssub_ov(Y, Overflow) : X.usub_ov(Y, Overflow);
5079
5080 if (Overflow)
5081 return SDValue();
5082
5083 if (IsEqual) {
5084 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
5085 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
5086 : NumActiveElems.uadd_ov(One, Overflow);
5087 if (Overflow)
5088 return SDValue();
5089 }
5090
5091 std::optional<unsigned> PredPattern =
5092 getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue());
5093 unsigned MinSVEVectorSize = std::max(
5094 Subtarget->getMinSVEVectorSizeInBits(), AArch64::SVEBitsPerBlock);
5095 unsigned ElementSize = 128 / Op.getValueType().getVectorMinNumElements();
5096 if (PredPattern != std::nullopt &&
5097 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
5098 return getPTrue(DAG, dl, Op.getValueType(), *PredPattern);
5099
5100 return SDValue();
5101}
5102
5103// Returns a safe bitcast between two scalable vector predicates, where
5104// any newly created lanes from a widening bitcast are defined as zero.
5105 static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
5106 SDLoc DL(Op);
5107 EVT InVT = Op.getValueType();
5108
5109 assert(InVT.getVectorElementType() == MVT::i1 &&
5110 VT.getVectorElementType() == MVT::i1 &&
5111 "Expected a predicate-to-predicate bitcast");
5112 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5113 InVT.isScalableVector() &&
5114 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
5115 "Only expect to cast between legal scalable predicate types!");
5116
5117 // Return the operand if the cast isn't changing type,
5118 // e.g. <n x 16 x i1> -> <n x 16 x i1>
5119 if (InVT == VT)
5120 return Op;
5121
5122 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
5123
5124 // We only have to zero the lanes if new lanes are being defined, e.g. when
5125 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
5126 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
5127 // we can return here.
5128 if (InVT.bitsGT(VT))
5129 return Reinterpret;
5130
5131 // Check if the other lanes are already known to be zeroed by
5132 // construction.
5133 if (isZeroingInactiveLanes(Op))
5134 return Reinterpret;
5135
5136 // Zero the newly introduced lanes.
5137 SDValue Mask = DAG.getConstant(1, DL, InVT);
5138 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
5139 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
5140}
5141
5142SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
5143 SDValue Chain, SDLoc DL,
5144 EVT VT) const {
5145 SDValue Callee = DAG.getExternalSymbol("__arm_sme_state",
5146 getPointerTy(DAG.getDataLayout()));
5147 Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
5148 Type *RetTy = StructType::get(Int64Ty, Int64Ty);
5149 TargetLowering::CallLoweringInfo CLI(DAG);
5150 ArgListTy Args;
5151 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
5152 CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2,
5153 RetTy, Callee, std::move(Args));
5154 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5155 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
5156 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
5157 Mask);
5158}
5159
5160// Lower an SME LDR/STR ZA intrinsic
5161// Case 1: If the vector number (vecnum) is an immediate in range, it gets
5162// folded into the instruction
5163// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
5164// Case 2: If the vecnum is not an immediate, then it is used to modify the base
5165// and tile slice registers
5166// ldr(%tileslice, %ptr, %vecnum)
5167// ->
5168// %svl = rdsvl
5169// %ptr2 = %ptr + %svl * %vecnum
5170// %tileslice2 = %tileslice + %vecnum
5171// ldr [%tileslice2, 0], [%ptr2, 0]
5172// Case 3: If the vecnum is an immediate out of range, then the same is done as
5173// case 2, but the base and slice registers are modified by the greatest
5174// multiple of 15 lower than the vecnum and the remainder is folded into the
5175// instruction. This means that successive loads and stores that are offset from
5176// each other can share the same base and slice register updates.
5177// ldr(%tileslice, %ptr, 22)
5178// ldr(%tileslice, %ptr, 23)
5179// ->
5180// %svl = rdsvl
5181// %ptr2 = %ptr + %svl * 15
5182// %tileslice2 = %tileslice + 15
5183// ldr [%tileslice2, 7], [%ptr2, 7]
5184// ldr [%tileslice2, 8], [%ptr2, 8]
5185// Case 4: If the vecnum is an add of an immediate, then the non-immediate
5186// operand and the immediate can be folded into the instruction, like case 2.
5187// ldr(%tileslice, %ptr, %vecnum + 7)
5188// ldr(%tileslice, %ptr, %vecnum + 8)
5189// ->
5190// %svl = rdsvl
5191// %ptr2 = %ptr + %svl * %vecnum
5192// %tileslice2 = %tileslice + %vecnum
5193// ldr [%tileslice2, 7], [%ptr2, 7]
5194// ldr [%tileslice2, 8], [%ptr2, 8]
5195// Case 5: The vecnum being an add of an immediate out of range is also handled,
5196// in which case the same remainder logic as case 3 is used.
5197 static SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
5198 SDLoc DL(N);
5199
5200 SDValue TileSlice = N->getOperand(2);
5201 SDValue Base = N->getOperand(3);
5202 SDValue VecNum = N->getOperand(4);
5203 int32_t ConstAddend = 0;
5204 SDValue VarAddend = VecNum;
5205
5206 // If the vnum is an add of an immediate, we can fold it into the instruction
5207 if (VecNum.getOpcode() == ISD::ADD &&
5208 isa<ConstantSDNode>(VecNum.getOperand(1))) {
5209 ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
5210 VarAddend = VecNum.getOperand(0);
5211 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
5212 ConstAddend = ImmNode->getSExtValue();
5213 VarAddend = SDValue();
5214 }
5215
5216 int32_t ImmAddend = ConstAddend % 16;
5217 if (int32_t C = (ConstAddend - ImmAddend)) {
5218 SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
5219 VarAddend = VarAddend
5220 ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
5221 : CVal;
5222 }
5223
5224 if (VarAddend) {
5225 // Get the vector length that will be multiplied by vnum
5226 auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
5227 DAG.getConstant(1, DL, MVT::i32));
5228
5229 // Multiply SVL and vnum then add it to the base
5230 SDValue Mul = DAG.getNode(
5231 ISD::MUL, DL, MVT::i64,
5232 {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
5233 Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
5234 // Just add vnum to the tileslice
5235 TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
5236 }
5237
5238 return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
5239 DL, MVT::Other,
5240 {/*Chain=*/N.getOperand(0), TileSlice, Base,
5241 DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
5242}
5243
5244SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
5245 SelectionDAG &DAG) const {
5246 unsigned IntNo = Op.getConstantOperandVal(1);
5247 SDLoc DL(Op);
5248 switch (IntNo) {
5249 default:
5250 return SDValue(); // Don't custom lower most intrinsics.
5251 case Intrinsic::aarch64_prefetch: {
5252 SDValue Chain = Op.getOperand(0);
5253 SDValue Addr = Op.getOperand(2);
5254
5255 unsigned IsWrite = Op.getConstantOperandVal(3);
5256 unsigned Locality = Op.getConstantOperandVal(4);
5257 unsigned IsStream = Op.getConstantOperandVal(5);
5258 unsigned IsData = Op.getConstantOperandVal(6);
5259 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
5260 (!IsData << 3) | // IsDataCache bit
5261 (Locality << 1) | // Cache level bits
5262 (unsigned)IsStream; // Stream bit
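// For example (illustrative): a read, data-cache, level-1, non-streaming
// prefetch encodes as 0b00000 (PLDL1KEEP), whereas a write, data-cache,
// level-1, streaming prefetch encodes as 0b10001 (PSTL1STRM).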
5263
5264 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
5265 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
5266 }
5267 case Intrinsic::aarch64_sme_str:
5268 case Intrinsic::aarch64_sme_ldr: {
5269 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
5270 }
5271 case Intrinsic::aarch64_sme_za_enable:
5272 return DAG.getNode(
5273 AArch64ISD::SMSTART, DL, MVT::Other,
5274 Op->getOperand(0), // Chain
5275 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5276 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5277 case Intrinsic::aarch64_sme_za_disable:
5278 return DAG.getNode(
5279 AArch64ISD::SMSTOP, DL, MVT::Other,
5280 Op->getOperand(0), // Chain
5281 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5282 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5283 }
5284}
5285
5286SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5287 SelectionDAG &DAG) const {
5288 unsigned IntNo = Op.getConstantOperandVal(1);
5289 SDLoc DL(Op);
5290 switch (IntNo) {
5291 default:
5292 return SDValue(); // Don't custom lower most intrinsics.
5293 case Intrinsic::aarch64_mops_memset_tag: {
5294 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
5295 SDValue Chain = Node->getChain();
5296 SDValue Dst = Op.getOperand(2);
5297 SDValue Val = Op.getOperand(3);
5298 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
5299 SDValue Size = Op.getOperand(4);
5300 auto Alignment = Node->getMemOperand()->getAlign();
5301 bool IsVol = Node->isVolatile();
5302 auto DstPtrInfo = Node->getPointerInfo();
5303
5304 const auto &SDI =
5305 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
5306 SDValue MS =
5307 SDI.EmitMOPS(AArch64ISD::MOPS_MEMSET_TAGGING, DAG, DL, Chain, Dst, Val,
5308 Size, Alignment, IsVol, DstPtrInfo, MachinePointerInfo{});
5309
5310 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
5311 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
5312 // LowerOperationWrapper will complain that the number of results has
5313 // changed.
5314 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
5315 }
5316 }
5317}
5318
5319SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
5320 SelectionDAG &DAG) const {
5321 unsigned IntNo = Op.getConstantOperandVal(0);
5322 SDLoc dl(Op);
5323 switch (IntNo) {
5324 default: return SDValue(); // Don't custom lower most intrinsics.
5325 case Intrinsic::thread_pointer: {
5326 EVT PtrVT = getPointerTy(DAG.getDataLayout());
5327 return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
5328 }
5329 case Intrinsic::aarch64_neon_abs: {
5330 EVT Ty = Op.getValueType();
5331 if (Ty == MVT::i64) {
5332 SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
5333 Op.getOperand(1));
5334 Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
5335 return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
5336 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
5337 return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
5338 } else {
5339 report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
5340 }
5341 }
5342 case Intrinsic::aarch64_neon_pmull64: {
5343 SDValue LHS = Op.getOperand(1);
5344 SDValue RHS = Op.getOperand(2);
5345
5346 std::optional<uint64_t> LHSLane =
5347 getConstantLaneNumOfExtractHalfOperand(LHS);
5348 std::optional<uint64_t> RHSLane =
5349 getConstantLaneNumOfExtractHalfOperand(RHS);
5350
5351 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
5352 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
5353
5354 // 'aarch64_neon_pmull64' takes i64 parameters; while pmull/pmull2
5355 // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
5356 // which ISel recognizes better. For example, generate a ldr into d*
5357 // registers as opposed to a GPR load followed by a fmov.
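// For instance (illustrative): when both operands are extractelement
// <2 x i64> %v, i64 1, each side is rewritten below as the high half of its
// source vector, allowing instruction selection to use PMULL2 on the original
// 128-bit registers instead of round-tripping the lanes through GPRs.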
5358 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
5359 std::optional<uint64_t> OtherLane,
5360 const SDLoc &dl,
5361 SelectionDAG &DAG) -> SDValue {
5362 // If the operand is an higher half itself, rewrite it to
5363 // extract_high_v2i64; this way aarch64_neon_pmull64 could
5364 // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
5365 if (NLane && *NLane == 1)
5366 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
5367 N.getOperand(0), DAG.getConstant(1, dl, MVT::i64));
5368
5369 // Operand N is not a higher half but the other operand is.
5370 if (OtherLane && *OtherLane == 1) {
5371 // If this operand is a lower half, rewrite it to
5372 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
5373 // align lanes of two operands. A roundtrip sequence (to move from lane
5374 // 1 to lane 0) is like this:
5375 // mov x8, v0.d[1]
5376 // fmov d0, x8
5377 if (NLane && *NLane == 0)
5378 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
5379 DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64,
5380 N.getOperand(0),
5381 DAG.getConstant(0, dl, MVT::i64)),
5382 DAG.getConstant(1, dl, MVT::i64));
5383
5384 // Otherwise just dup from main to all lanes.
5385 return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N);
5386 }
5387
5388 // Neither operand is an extract of higher half, so codegen may just use
5389 // the non-high version of PMULL instruction. Use v1i64 to represent i64.
5390 assert(N.getValueType() == MVT::i64 &&
5391 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
5392 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N);
5393 };
5394
5395 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG);
5396 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG);
5397
5398 return DAG.getNode(AArch64ISD::PMULL, dl, Op.getValueType(), LHS, RHS);
5399 }
5400 case Intrinsic::aarch64_neon_smax:
5401 return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
5402 Op.getOperand(1), Op.getOperand(2));
5403 case Intrinsic::aarch64_neon_umax:
5404 return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
5405 Op.getOperand(1), Op.getOperand(2));
5406 case Intrinsic::aarch64_neon_smin:
5407 return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
5408 Op.getOperand(1), Op.getOperand(2));
5409 case Intrinsic::aarch64_neon_umin:
5410 return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
5411 Op.getOperand(1), Op.getOperand(2));
5412 case Intrinsic::aarch64_neon_scalar_sqxtn:
5413 case Intrinsic::aarch64_neon_scalar_sqxtun:
5414 case Intrinsic::aarch64_neon_scalar_uqxtn: {
5415 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
5416 if (Op.getValueType() == MVT::i32)
5417 return DAG.getNode(ISD::BITCAST, dl, MVT::i32,
5418 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::f32,
5419 Op.getOperand(0),
5420 DAG.getNode(ISD::BITCAST, dl, MVT::f64,
5421 Op.getOperand(1))));
5422 return SDValue();
5423 }
5424 case Intrinsic::aarch64_sve_whilelo:
5425 return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/true,
5426 /*IsEqual=*/false);
5427 case Intrinsic::aarch64_sve_whilelt:
5428 return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/true,
5429 /*IsEqual=*/false);
5430 case Intrinsic::aarch64_sve_whilels:
5431 return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/true,
5432 /*IsEqual=*/true);
5433 case Intrinsic::aarch64_sve_whilele:
5434 return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/true,
5435 /*IsEqual=*/true);
5436 case Intrinsic::aarch64_sve_whilege:
5437 return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/false,
5438 /*IsEqual=*/true);
5439 case Intrinsic::aarch64_sve_whilegt:
5440 return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/false,
5441 /*IsEqual=*/false);
5442 case Intrinsic::aarch64_sve_whilehs:
5443 return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/false,
5444 /*IsEqual=*/true);
5445 case Intrinsic::aarch64_sve_whilehi:
5446 return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/false,
5447 /*IsEqual=*/false);
5448 case Intrinsic::aarch64_sve_sunpkhi:
5449 return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
5450 Op.getOperand(1));
5451 case Intrinsic::aarch64_sve_sunpklo:
5452 return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
5453 Op.getOperand(1));
5454 case Intrinsic::aarch64_sve_uunpkhi:
5455 return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
5456 Op.getOperand(1));
5457 case Intrinsic::aarch64_sve_uunpklo:
5458 return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
5459 Op.getOperand(1));
5460 case Intrinsic::aarch64_sve_clasta_n:
5461 return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
5462 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5463 case Intrinsic::aarch64_sve_clastb_n:
5464 return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
5465 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5466 case Intrinsic::aarch64_sve_lasta:
5467 return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
5468 Op.getOperand(1), Op.getOperand(2));
5469 case Intrinsic::aarch64_sve_lastb:
5470 return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
5471 Op.getOperand(1), Op.getOperand(2));
5472 case Intrinsic::aarch64_sve_rev:
5473 return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
5474 Op.getOperand(1));
5475 case Intrinsic::aarch64_sve_tbl:
5476 return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
5477 Op.getOperand(1), Op.getOperand(2));
5478 case Intrinsic::aarch64_sve_trn1:
5479 return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
5480 Op.getOperand(1), Op.getOperand(2));
5481 case Intrinsic::aarch64_sve_trn2:
5482 return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
5483 Op.getOperand(1), Op.getOperand(2));
5484 case Intrinsic::aarch64_sve_uzp1:
5485 return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
5486 Op.getOperand(1), Op.getOperand(2));
5487 case Intrinsic::aarch64_sve_uzp2:
5488 return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
5489 Op.getOperand(1), Op.getOperand(2));
5490 case Intrinsic::aarch64_sve_zip1:
5491 return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
5492 Op.getOperand(1), Op.getOperand(2));
5493 case Intrinsic::aarch64_sve_zip2:
5494 return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
5495 Op.getOperand(1), Op.getOperand(2));
5496 case Intrinsic::aarch64_sve_splice:
5497 return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
5498 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5499 case Intrinsic::aarch64_sve_ptrue:
5500 return getPTrue(DAG, dl, Op.getValueType(), Op.getConstantOperandVal(1));
5501 case Intrinsic::aarch64_sve_clz:
5502 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
5503 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5504 case Intrinsic::aarch64_sme_cntsb:
5505 return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5506 DAG.getConstant(1, dl, MVT::i32));
5507 case Intrinsic::aarch64_sme_cntsh: {
5508 SDValue One = DAG.getConstant(1, dl, MVT::i32);
5509 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One);
5510 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One);
5511 }
5512 case Intrinsic::aarch64_sme_cntsw: {
5513 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5514 DAG.getConstant(1, dl, MVT::i32));
5515 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
5516 DAG.getConstant(2, dl, MVT::i32));
5517 }
5518 case Intrinsic::aarch64_sme_cntsd: {
5519 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5520 DAG.getConstant(1, dl, MVT::i32));
5521 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
5522 DAG.getConstant(3, dl, MVT::i32));
5523 }
5524 case Intrinsic::aarch64_sve_cnt: {
5525 SDValue Data = Op.getOperand(3);
5526 // CTPOP only supports integer operands.
5527 if (Data.getValueType().isFloatingPoint())
5528 Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
5529 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
5530 Op.getOperand(2), Data, Op.getOperand(1));
5531 }
5532 case Intrinsic::aarch64_sve_dupq_lane:
5533 return LowerDUPQLane(Op, DAG);
5534 case Intrinsic::aarch64_sve_convert_from_svbool:
5535 if (Op.getValueType() == MVT::aarch64svcount)
5536 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Op.getOperand(1));
5537 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
5538 case Intrinsic::aarch64_sve_convert_to_svbool:
5539 if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
5540 return DAG.getNode(ISD::BITCAST, dl, MVT::nxv16i1, Op.getOperand(1));
5541 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
5542 case Intrinsic::aarch64_sve_fneg:
5543 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
5544 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5545 case Intrinsic::aarch64_sve_frintp:
5546 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
5547 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5548 case Intrinsic::aarch64_sve_frintm:
5549 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
5550 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5551 case Intrinsic::aarch64_sve_frinti:
5552 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(),
5553 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5554 case Intrinsic::aarch64_sve_frintx:
5555 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
5556 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5557 case Intrinsic::aarch64_sve_frinta:
5558 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
5559 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5560 case Intrinsic::aarch64_sve_frintn:
5561 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(),
5562 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5563 case Intrinsic::aarch64_sve_frintz:
5564 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
5565 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5566 case Intrinsic::aarch64_sve_ucvtf:
5567 return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl,
5568 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5569 Op.getOperand(1));
5570 case Intrinsic::aarch64_sve_scvtf:
5571 return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl,
5572 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5573 Op.getOperand(1));
5574 case Intrinsic::aarch64_sve_fcvtzu:
5575 return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl,
5576 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5577 Op.getOperand(1));
5578 case Intrinsic::aarch64_sve_fcvtzs:
5579 return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl,
5580 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5581 Op.getOperand(1));
5582 case Intrinsic::aarch64_sve_fsqrt:
5583 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
5584 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5585 case Intrinsic::aarch64_sve_frecpx:
5586 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
5587 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5588 case Intrinsic::aarch64_sve_frecpe_x:
5589 return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(),
5590 Op.getOperand(1));
5591 case Intrinsic::aarch64_sve_frecps_x:
5592 return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(),
5593 Op.getOperand(1), Op.getOperand(2));
5594 case Intrinsic::aarch64_sve_frsqrte_x:
5595 return DAG.getNode(AArch64ISD::FRSQRTE, dl, Op.getValueType(),
5596 Op.getOperand(1));
5597 case Intrinsic::aarch64_sve_frsqrts_x:
5598 return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(),
5599 Op.getOperand(1), Op.getOperand(2));
5600 case Intrinsic::aarch64_sve_fabs:
5601 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
5602 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5603 case Intrinsic::aarch64_sve_abs:
5604 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
5605 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5606 case Intrinsic::aarch64_sve_neg:
5607 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
5608 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5609 case Intrinsic::aarch64_sve_insr: {
5610 SDValue Scalar = Op.getOperand(2);
5611 EVT ScalarTy = Scalar.getValueType();
5612 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
5613 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
5614
5615 return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
5616 Op.getOperand(1), Scalar);
5617 }
5618 case Intrinsic::aarch64_sve_rbit:
5619 return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl,
5620 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5621 Op.getOperand(1));
5622 case Intrinsic::aarch64_sve_revb:
5623 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
5624 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5625 case Intrinsic::aarch64_sve_revh:
5626 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(),
5627 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5628 case Intrinsic::aarch64_sve_revw:
5629 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(),
5630 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5631 case Intrinsic::aarch64_sve_revd:
5632 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(),
5633 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5634 case Intrinsic::aarch64_sve_sxtb:
5635 return DAG.getNode(
5636 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5637 Op.getOperand(2), Op.getOperand(3),
5638 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
5639 Op.getOperand(1));
5640 case Intrinsic::aarch64_sve_sxth:
5641 return DAG.getNode(
5642 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5643 Op.getOperand(2), Op.getOperand(3),
5644 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
5645 Op.getOperand(1));
5646 case Intrinsic::aarch64_sve_sxtw:
5647 return DAG.getNode(
5648 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5649 Op.getOperand(2), Op.getOperand(3),
5650 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
5651 Op.getOperand(1));
5652 case Intrinsic::aarch64_sve_uxtb:
5653 return DAG.getNode(
5654 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5655 Op.getOperand(2), Op.getOperand(3),
5656 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
5657 Op.getOperand(1));
5658 case Intrinsic::aarch64_sve_uxth:
5659 return DAG.getNode(
5660 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5661 Op.getOperand(2), Op.getOperand(3),
5662 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
5663 Op.getOperand(1));
5664 case Intrinsic::aarch64_sve_uxtw:
5665 return DAG.getNode(
5666 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5667 Op.getOperand(2), Op.getOperand(3),
5668 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
5669 Op.getOperand(1));
5670 case Intrinsic::localaddress: {
5671 const auto &MF = DAG.getMachineFunction();
5672 const auto *RegInfo = Subtarget->getRegisterInfo();
5673 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
5674 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
5675 Op.getSimpleValueType());
5676 }
5677
5678 case Intrinsic::eh_recoverfp: {
5679 // FIXME: This needs to be implemented to correctly handle highly aligned
5680 // stack objects. For now we simply return the incoming FP. Refer D53541
5681 // for more details.
5682 SDValue FnOp = Op.getOperand(1);
5683 SDValue IncomingFPOp = Op.getOperand(2);
5684 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
5685 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
5686 if (!Fn)
5688 "llvm.eh.recoverfp must take a function as the first argument");
5689 return IncomingFPOp;
5690 }
5691
5692 case Intrinsic::aarch64_neon_vsri:
5693 case Intrinsic::aarch64_neon_vsli:
5694 case Intrinsic::aarch64_sve_sri:
5695 case Intrinsic::aarch64_sve_sli: {
5696 EVT Ty = Op.getValueType();
5697
5698 if (!Ty.isVector())
5699 report_fatal_error("Unexpected type for aarch64_neon_vsli");
5700
5701 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
5702
5703 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
5704 IntNo == Intrinsic::aarch64_sve_sri;
5705 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
5706 return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
5707 Op.getOperand(3));
5708 }
5709
5710 case Intrinsic::aarch64_neon_srhadd:
5711 case Intrinsic::aarch64_neon_urhadd:
5712 case Intrinsic::aarch64_neon_shadd:
5713 case Intrinsic::aarch64_neon_uhadd: {
5714 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
5715 IntNo == Intrinsic::aarch64_neon_shadd);
5716 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
5717 IntNo == Intrinsic::aarch64_neon_urhadd);
5718 unsigned Opcode = IsSignedAdd
5719 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
5720 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
5721 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
5722 Op.getOperand(2));
5723 }
5724 case Intrinsic::aarch64_neon_saddlp:
5725 case Intrinsic::aarch64_neon_uaddlp: {
5726 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
5727 ? AArch64ISD::UADDLP
5728 : AArch64ISD::SADDLP;
5729 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
5730 }
5731 case Intrinsic::aarch64_neon_sdot:
5732 case Intrinsic::aarch64_neon_udot:
5733 case Intrinsic::aarch64_sve_sdot:
5734 case Intrinsic::aarch64_sve_udot: {
5735 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
5736 IntNo == Intrinsic::aarch64_sve_udot)
5737 ? AArch64ISD::UDOT
5738 : AArch64ISD::SDOT;
5739 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
5740 Op.getOperand(2), Op.getOperand(3));
5741 }
5742 case Intrinsic::get_active_lane_mask: {
5743 SDValue ID =
5744 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
5745 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), ID,
5746 Op.getOperand(1), Op.getOperand(2));
5747 }
5748 case Intrinsic::aarch64_neon_uaddlv: {
5749 EVT OpVT = Op.getOperand(1).getValueType();
5750 EVT ResVT = Op.getValueType();
5751 if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
5752 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) {
5753 // Use v4i32 rather than v2i32 in order to avoid an insert_subvector.
5754 SDValue UADDLV =
5755 DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1));
5756 SDValue EXTRACT_VEC_ELT =
5757 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, UADDLV,
5758 DAG.getConstant(0, dl, MVT::i64));
5759 return EXTRACT_VEC_ELT;
5760 }
5761 return SDValue();
5762 }
5763 case Intrinsic::experimental_cttz_elts: {
5764 SDValue NewCttzElts =
5765 DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, Op.getOperand(1));
5766
5767 return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
5768 }
5769 }
5770}
5771
5772bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
5773 if (VT.getVectorElementType() == MVT::i8 ||
5774 VT.getVectorElementType() == MVT::i16) {
5775 EltTy = MVT::i32;
5776 return true;
5777 }
5778 return false;
5779}
5780
5781bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
5782 EVT DataVT) const {
5783 const EVT IndexVT = Extend.getOperand(0).getValueType();
5784 // SVE only supports implicit extension of 32-bit indices.
5785 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
5786 return false;
5787
5788 // Indices cannot be smaller than the main data type.
5789 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
5790 return false;
5791
5792 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
5793 // element container type, which would violate the previous clause.
5794 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
5795}
5796
5797bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
5798 EVT ExtVT = ExtVal.getValueType();
5799 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
5800 return false;
5801
5802 // It may be worth creating extending masked loads if there are multiple
5803 // masked loads using the same predicate. That way we'll end up creating
5804 // extending masked loads that may then get split by the legaliser. This
5805 // results in just one set of predicate unpacks at the start, instead of
5806 // multiple sets of vector unpacks after each load.
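// Illustrative example: two masked loads of nxv8i16 that are both
// zero-extended to nxv8i32 under the same nxv8i1 predicate; emitting them as
// extending masked loads means the predicate is unpacked once up front rather
// than unpacking every loaded vector afterwards.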
5807 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
5808 if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) {
5809 // Disable extending masked loads for fixed-width for now, since the code
5810 // quality doesn't look great.
5811 if (!ExtVT.isScalableVector())
5812 return false;
5813
5814 unsigned NumExtMaskedLoads = 0;
5815 for (auto *U : Ld->getMask()->uses())
5816 if (isa<MaskedLoadSDNode>(U))
5817 NumExtMaskedLoads++;
5818
5819 if (NumExtMaskedLoads <= 1)
5820 return false;
5821 }
5822 }
5823
5824 return true;
5825}
5826
5827unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
5828 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
5829 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
5830 AArch64ISD::GLD1_MERGE_ZERO},
5831 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
5832 AArch64ISD::GLD1_UXTW_MERGE_ZERO},
5833 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
5834 AArch64ISD::GLD1_MERGE_ZERO},
5835 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
5836 AArch64ISD::GLD1_SXTW_MERGE_ZERO},
5837 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
5838 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
5839 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
5840 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
5841 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
5842 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
5843 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
5844 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
5845 };
5846 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
5847 return AddrModes.find(Key)->second;
5848}
5849
5850unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
5851 switch (Opcode) {
5852 default:
5853 llvm_unreachable("unimplemented opcode");
5854 return Opcode;
5855 case AArch64ISD::GLD1_MERGE_ZERO:
5856 return AArch64ISD::GLD1S_MERGE_ZERO;
5857 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
5858 return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
5859 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
5860 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
5861 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
5862 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
5863 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
5864 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
5865 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
5866 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
5867 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
5868 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
5869 }
5870}
5871
5872SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
5873 SelectionDAG &DAG) const {
5874 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
5875
5876 SDLoc DL(Op);
5877 SDValue Chain = MGT->getChain();
5878 SDValue PassThru = MGT->getPassThru();
5879 SDValue Mask = MGT->getMask();
5880 SDValue BasePtr = MGT->getBasePtr();
5881 SDValue Index = MGT->getIndex();
5882 SDValue Scale = MGT->getScale();
5883 EVT VT = Op.getValueType();
5884 EVT MemVT = MGT->getMemoryVT();
5885 ISD::LoadExtType ExtType = MGT->getExtensionType();
5886 ISD::MemIndexType IndexType = MGT->getIndexType();
5887
5888 // SVE supports zero (and so undef) passthrough values only, everything else
5889 // must be handled manually by an explicit select on the load's output.
5890 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
5891 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
5892 SDValue Load =
5893 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
5894 MGT->getMemOperand(), IndexType, ExtType);
5895 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
5896 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
5897 }
5898
5899 bool IsScaled = MGT->isIndexScaled();
5900 bool IsSigned = MGT->isIndexSigned();
5901
5902 // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else
5903 // must be calculated beforehand.
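// For example (illustrative): a gather of i16 elements whose index is scaled
// by 4 is rewritten below as (Index << 2) with a scale of 1, since only an
// unscaled index or an index scaled by the element size maps onto the SVE
// addressing modes.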
5904 uint64_t ScaleVal = Scale->getAsZExtVal();
5905 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
5906 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
5907 EVT IndexVT = Index.getValueType();
5908 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
5909 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
5910 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
5911
5912 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
5913 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
5914 MGT->getMemOperand(), IndexType, ExtType);
5915 }
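  // Worked example (illustrative): gathering i16 elements (store size 2)
  // with Scale == 8 takes the path above; the index is shifted left by
  // log2(8) == 3 and Scale is reset to 1, so the byte offset Index * 8 is
  // now carried entirely by the pre-shifted index.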
5916
5917 // Lower fixed length gather to a scalable equivalent.
5918 if (VT.isFixedLengthVector()) {
5919 assert(Subtarget->useSVEForFixedLengthVectors() &&
5920 "Cannot lower when not using SVE for fixed vectors!");
5921
5922 // NOTE: Handle floating-point as if integer then bitcast the result.
 5923     EVT DataVT = VT.isFloatingPoint() ? VT.changeVectorElementTypeToInteger() : VT;
 5924     MemVT = MemVT.changeVectorElementTypeToInteger();
5925
5926 // Find the smallest integer fixed length vector we can use for the gather.
5927 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
5928 if (DataVT.getVectorElementType() == MVT::i64 ||
5929 Index.getValueType().getVectorElementType() == MVT::i64 ||
5930 Mask.getValueType().getVectorElementType() == MVT::i64)
5931 PromotedVT = VT.changeVectorElementType(MVT::i64);
5932
5933 // Promote vector operands except for passthrough, which we know is either
5934 // undef or zero, and thus best constructed directly.
5935 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5936 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
5937 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
5938
5939 // A promoted result type forces the need for an extending load.
5940 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
5941 ExtType = ISD::EXTLOAD;
5942
5943 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
5944
5945 // Convert fixed length vector operands to scalable.
5946 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
5947 Index = convertToScalableVector(DAG, ContainerVT, Index);
5949 PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
5950 : DAG.getConstant(0, DL, ContainerVT);
5951
5952 // Emit equivalent scalable vector gather.
5953 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
5954 SDValue Load =
5955 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
5956 Ops, MGT->getMemOperand(), IndexType, ExtType);
5957
5958 // Extract fixed length data then convert to the required result type.
5959 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
5960 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
5961 if (VT.isFloatingPoint())
5962 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
5963
5964 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
5965 }
5966
5967 // Everything else is legal.
5968 return Op;
5969}
5970
5971SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
5972 SelectionDAG &DAG) const {
5973 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
5974
5975 SDLoc DL(Op);
5976 SDValue Chain = MSC->getChain();
5977 SDValue StoreVal = MSC->getValue();
5978 SDValue Mask = MSC->getMask();
5979 SDValue BasePtr = MSC->getBasePtr();
5980 SDValue Index = MSC->getIndex();
5981 SDValue Scale = MSC->getScale();
5982 EVT VT = StoreVal.getValueType();
5983 EVT MemVT = MSC->getMemoryVT();
5984 ISD::MemIndexType IndexType = MSC->getIndexType();
5985 bool Truncating = MSC->isTruncatingStore();
5986
5987 bool IsScaled = MSC->isIndexScaled();
5988 bool IsSigned = MSC->isIndexSigned();
5989
 5990   // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
 5991   // must be calculated beforehand.
5992 uint64_t ScaleVal = Scale->getAsZExtVal();
5993 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
5994 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
5995 EVT IndexVT = Index.getValueType();
5996 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
5997 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
5998 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
5999
6000 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6001 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6002 MSC->getMemOperand(), IndexType, Truncating);
6003 }
6004
6005 // Lower fixed length scatter to a scalable equivalent.
6006 if (VT.isFixedLengthVector()) {
6007 assert(Subtarget->useSVEForFixedLengthVectors() &&
6008 "Cannot lower when not using SVE for fixed vectors!");
6009
6010 // Once bitcast we treat floating-point scatters as if integer.
6011 if (VT.isFloatingPoint()) {
 6012       VT = VT.changeVectorElementTypeToInteger();
 6013       MemVT = MemVT.changeVectorElementTypeToInteger();
6014 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
6015 }
6016
6017 // Find the smallest integer fixed length vector we can use for the scatter.
6018 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6019 if (VT.getVectorElementType() == MVT::i64 ||
6020 Index.getValueType().getVectorElementType() == MVT::i64 ||
6021 Mask.getValueType().getVectorElementType() == MVT::i64)
6022 PromotedVT = VT.changeVectorElementType(MVT::i64);
6023
6024 // Promote vector operands.
6025 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6026 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6027 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6028 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
6029
6030 // A promoted value type forces the need for a truncating store.
6031 if (PromotedVT != VT)
6032 Truncating = true;
6033
6034 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6035
6036 // Convert fixed length vector operands to scalable.
6037 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6038 Index = convertToScalableVector(DAG, ContainerVT, Index);
6040 StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
6041
6042 // Emit equivalent scalable vector scatter.
6043 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6044 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6045 MSC->getMemOperand(), IndexType, Truncating);
6046 }
6047
6048 // Everything else is legal.
6049 return Op;
6050}
6051
6052SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
6053 SDLoc DL(Op);
6054 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
6055 assert(LoadNode && "Expected custom lowering of a masked load node");
6056 EVT VT = Op->getValueType(0);
6057
6058 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
6059 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
6060
6061 SDValue PassThru = LoadNode->getPassThru();
6062 SDValue Mask = LoadNode->getMask();
6063
6064 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
6065 return Op;
6066
 6067   SDValue Load = DAG.getMaskedLoad(
 6068       VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
6069 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
6070 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
6071 LoadNode->getExtensionType());
6072
6073 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6074
6075 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6076}
6077
6078// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
 6079 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
 6080                                         EVT VT, EVT MemVT,
6081 SelectionDAG &DAG) {
6082 assert(VT.isVector() && "VT should be a vector type");
6083 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
6084
6085 SDValue Value = ST->getValue();
6086
 6087   // First extend the promoted v4i16 to v8i16, truncate to v8i8, and extract
 6088   // the word lane that represents the v4i8 subvector. This optimizes the
 6089   // store to:
6090 //
6091 // xtn v0.8b, v0.8h
6092 // str s0, [x0]
6093
6094 SDValue Undef = DAG.getUNDEF(MVT::i16);
6095 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
6096 {Undef, Undef, Undef, Undef});
6097
6098 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
6099 Value, UndefVec);
6100 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
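  // The concat-with-undef step is needed because v4i8 is not a legal NEON
  // type: widening to v8i16 lets the truncate produce a legal v8i8, of which
  // only the low 32-bit lane (the original four bytes) is stored below.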
6101
6102 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
6103 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
6104 Trunc, DAG.getConstant(0, DL, MVT::i64));
6105
6106 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
6107 ST->getBasePtr(), ST->getMemOperand());
6108}
6109
 6110 // Custom lowering for any store, vector or scalar, normal or truncating.
 6111 // Currently we only custom lower truncating stores from vector v4i16 to
 6112 // v4i8 and volatile stores of i128.
6113SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
6114 SelectionDAG &DAG) const {
6115 SDLoc Dl(Op);
6116 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
6117 assert (StoreNode && "Can only custom lower store nodes");
6118
6119 SDValue Value = StoreNode->getValue();
6120
6121 EVT VT = Value.getValueType();
6122 EVT MemVT = StoreNode->getMemoryVT();
6123
6124 if (VT.isVector()) {
 6125     if (useSVEForFixedLengthVectorVT(
 6126             VT,
6127 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
6128 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
6129
6130 unsigned AS = StoreNode->getAddressSpace();
6131 Align Alignment = StoreNode->getAlign();
6132 if (Alignment < MemVT.getStoreSize() &&
6133 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
6134 StoreNode->getMemOperand()->getFlags(),
6135 nullptr)) {
6136 return scalarizeVectorStore(StoreNode, DAG);
6137 }
6138
6139 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
6140 MemVT == MVT::v4i8) {
6141 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
6142 }
6143 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
6144 // the custom lowering, as there are no un-paired non-temporal stores and
6145 // legalization will break up 256 bit inputs.
 6146     ElementCount EC = MemVT.getVectorElementCount();
 6147     if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
6148 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
6149 (MemVT.getScalarSizeInBits() == 8u ||
6150 MemVT.getScalarSizeInBits() == 16u ||
6151 MemVT.getScalarSizeInBits() == 32u ||
6152 MemVT.getScalarSizeInBits() == 64u)) {
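      // Illustrative example: a non-temporal store of v8i32 (256 bits) is
      // split into two v4i32 halves (elements 0-3 and 4-7) and emitted as a
      // single STNP of two q registers.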
 6153       SDValue Lo =
 6154           DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
 6155                       MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
 6156                       StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
 6157       SDValue Hi =
 6158           DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
 6159                       MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
 6160                       StoreNode->getValue(),
 6161                       DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
 6162       SDValue Result = DAG.getMemIntrinsicNode(
 6163           AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
 6164           {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
 6165           StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6166 return Result;
6167 }
6168 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
6169 return LowerStore128(Op, DAG);
6170 } else if (MemVT == MVT::i64x8) {
6171 SDValue Value = StoreNode->getValue();
6172 assert(Value->getValueType(0) == MVT::i64x8);
6173 SDValue Chain = StoreNode->getChain();
6174 SDValue Base = StoreNode->getBasePtr();
6175 EVT PtrVT = Base.getValueType();
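    // The i64x8 type corresponds to the LS64 extension: the 512-bit value is
    // split into eight i64 pieces stored at byte offsets 0, 8, ..., 56.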
6176 for (unsigned i = 0; i < 8; i++) {
6177 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
6178 Value, DAG.getConstant(i, Dl, MVT::i32));
6179 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
6180 DAG.getConstant(i * 8, Dl, PtrVT));
6181 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
6182 StoreNode->getOriginalAlign());
6183 }
6184 return Chain;
6185 }
6186
6187 return SDValue();
6188}
6189
6190/// Lower atomic or volatile 128-bit stores to a single STP instruction.
6191SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
6192 SelectionDAG &DAG) const {
6193 MemSDNode *StoreNode = cast<MemSDNode>(Op);
6194 assert(StoreNode->getMemoryVT() == MVT::i128);
6195 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
6196
 6197   bool IsStoreRelease =
 6198       StoreNode->getMergedOrdering() == AtomicOrdering::Release;
6199 if (StoreNode->isAtomic())
6200 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
6201 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
6204
6205 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
6206 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
6207 ? StoreNode->getOperand(1)
6208 : StoreNode->getOperand(2);
6209 SDLoc DL(Op);
6210 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
6211 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
6212 if (DAG.getDataLayout().isBigEndian())
6213 std::swap(StoreValue.first, StoreValue.second);
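  // At this point the i128 value has been split into two i64 halves (swapped
  // on big-endian so the in-memory layout is preserved) and is emitted below
  // as a single STP, or STILP when a release-ordered atomic store is needed.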
6215 Opcode, DL, DAG.getVTList(MVT::Other),
6216 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
6217 StoreNode->getBasePtr()},
6218 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6219 return Result;
6220}
6221
6222SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
6223 SelectionDAG &DAG) const {
6224 SDLoc DL(Op);
6225 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
6226 assert(LoadNode && "Expected custom lowering of a load node");
6227
6228 if (LoadNode->getMemoryVT() == MVT::i64x8) {
 6229     SmallVector<SDValue, 8> Ops;
 6230     SDValue Base = LoadNode->getBasePtr();
6231 SDValue Chain = LoadNode->getChain();
6232 EVT PtrVT = Base.getValueType();
6233 for (unsigned i = 0; i < 8; i++) {
6234 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
6235 DAG.getConstant(i * 8, DL, PtrVT));
6236 SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
6237 LoadNode->getPointerInfo(),
6238 LoadNode->getOriginalAlign());
6239 Ops.push_back(Part);
6240 Chain = SDValue(Part.getNode(), 1);
6241 }
6242 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
6243 return DAG.getMergeValues({Loaded, Chain}, DL);
6244 }
6245
6246 // Custom lowering for extending v4i8 vector loads.
6247 EVT VT = Op->getValueType(0);
6248 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
6249
6250 if (LoadNode->getMemoryVT() != MVT::v4i8)
6251 return SDValue();
6252
6253 unsigned ExtType;
6254 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
6255 ExtType = ISD::SIGN_EXTEND;
6256 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
6257 LoadNode->getExtensionType() == ISD::EXTLOAD)
6258 ExtType = ISD::ZERO_EXTEND;
6259 else
6260 return SDValue();
6261
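  // The sequence below loads the four bytes as a scalar f32, moves them into
  // lane 0 of a v2f32, reinterprets the register as v8i8 and extends it; only
  // the low v4i16 half carries meaningful data, so that half is extracted
  // (and extended once more when a v4i32 result is required).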
6262 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
6263 LoadNode->getBasePtr(), MachinePointerInfo());
6264 SDValue Chain = Load.getValue(1);
6265 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
6266 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
6267 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
6268 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
6269 DAG.getConstant(0, DL, MVT::i64));
6270 if (VT == MVT::v4i32)
6271 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
6272 return DAG.getMergeValues({Ext, Chain}, DL);
6273}
6274
6275// Generate SUBS and CSEL for integer abs.
6276SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
6277 MVT VT = Op.getSimpleValueType();
6278
6279 if (VT.isVector())
6280 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
6281
6282 SDLoc DL(Op);
6283 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
6284 Op.getOperand(0));
6285 // Generate SUBS & CSEL.
6286 SDValue Cmp =
6287 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
6288 Op.getOperand(0), DAG.getConstant(0, DL, VT));
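  // Worked example (illustrative): for an input of -5 the SUBS sets the N
  // flag, PL is false and the CSEL below selects the negated value 5; for a
  // non-negative input PL holds and the original value is returned unchanged.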
6289 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
6290 DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
6291 Cmp.getValue(1));
6292}
6293
 6294 static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
 6295   SDValue Chain = Op.getOperand(0);
6296 SDValue Cond = Op.getOperand(1);
6297 SDValue Dest = Op.getOperand(2);
6298
 6299   AArch64CC::CondCode CC;
 6300   if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
6301 SDLoc dl(Op);
6302 SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32);
6303 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
6304 Cmp);
6305 }
6306
6307 return SDValue();
6308}
6309
 6310 // Treat FSHR with constant shifts as a legal operation; otherwise it is
 6311 // expanded. FSHL is converted to FSHR before deciding what to do with it.
 6312 static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) {
6313 SDValue Shifts = Op.getOperand(2);
6314 // Check if the shift amount is a constant
6315 // If opcode is FSHL, convert it to FSHR
6316 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
6317 SDLoc DL(Op);
6318 MVT VT = Op.getSimpleValueType();
6319
6320 if (Op.getOpcode() == ISD::FSHL) {
6321 unsigned int NewShiftNo =
6322 VT.getFixedSizeInBits() - ShiftNo->getZExtValue();
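      // e.g. for i32 operands, fshl(x, y, 3) == fshr(x, y, 32 - 3), so the
      // shift amount is rewritten and the node becomes a legal FSHR.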
6323 return DAG.getNode(
6324 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
6325 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
6326 } else if (Op.getOpcode() == ISD::FSHR) {
6327 return Op;
6328 }
6329 }
6330
6331 return SDValue();
6332}
6333
6335 SDValue X = Op.getOperand(0);
6336 EVT XScalarTy = X.getValueType();
6337 SDValue Exp = Op.getOperand(1);
6338
6339 SDLoc DL(Op);
6340 EVT XVT, ExpVT;
6341 switch (Op.getSimpleValueType().SimpleTy) {
6342 default:
6343 return SDValue();
6344 case MVT::bf16:
6345 case MVT::f16:
6346 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
6347 [[fallthrough]];
6348 case MVT::f32:
6349 XVT = MVT::nxv4f32;
6350 ExpVT = MVT::nxv4i32;
6351 break;
6352 case MVT::f64:
6353 XVT = MVT::nxv2f64;
6354 ExpVT = MVT::nxv2i64;
6355 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
6356 break;
6357 }
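  // The scalar ldexp is implemented below via the SVE FSCALE intrinsic: the
  // value and exponent are placed in lane 0 of scalable vectors, scaled under
  // an all-true predicate, and lane 0 of the result is extracted (with a
  // final round back to f16/bf16 when the input was extended).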
6358
6359 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
6360 SDValue VX =
6361 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
6362 SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
6363 DAG.getUNDEF(ExpVT), Exp, Zero);
6364 SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1),
6365 AArch64SVEPredPattern::all);
6366 SDValue FScale =
6368 DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64),
6369 VPg, VX, VExp);
6370 SDValue Final =
6371 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
6372 if (X.getValueType() != XScalarTy)
6373 Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
6374 DAG.getIntPtrConstant(1, SDLoc(Op)));
6375 return Final;
6376}
6377
 6378 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
 6379                                               SelectionDAG &DAG) const {
6380 LLVM_DEBUG(dbgs() << "Custom lowering: ");
6381 LLVM_DEBUG(Op.dump());
6382
6383 switch (Op.getOpcode()) {
6384 default:
6385 llvm_unreachable("unimplemented operand");
6386 return SDValue();
6387 case ISD::BITCAST:
6388 return LowerBITCAST(Op, DAG);
6389 case ISD::GlobalAddress:
6390 return LowerGlobalAddress(Op, DAG);
6392 return LowerGlobalTLSAddress(Op, DAG);
6393 case ISD::SETCC:
6394 case ISD::STRICT_FSETCC:
6396 return LowerSETCC(Op, DAG);
6397 case ISD::SETCCCARRY:
6398 return LowerSETCCCARRY(Op, DAG);
6399 case ISD::BRCOND:
6400 return LowerBRCOND(Op, DAG);
6401 case ISD::BR_CC:
6402 return LowerBR_CC(Op, DAG);
6403 case ISD::SELECT:
6404 return LowerSELECT(Op, DAG);
6405 case ISD::SELECT_CC:
6406 return LowerSELECT_CC(Op, DAG);
6407 case ISD::JumpTable:
6408 return LowerJumpTable(Op, DAG);
6409 case ISD::BR_JT:
6410 return LowerBR_JT(Op, DAG);
6411 case ISD::ConstantPool:
6412 return LowerConstantPool(Op, DAG);
6413 case ISD::BlockAddress:
6414 return LowerBlockAddress(Op, DAG);
6415 case ISD::VASTART:
6416 return LowerVASTART(Op, DAG);
6417 case ISD::VACOPY:
6418 return LowerVACOPY(Op, DAG);
6419 case ISD::VAARG:
6420 return LowerVAARG(Op, DAG);
6421 case ISD::UADDO_CARRY:
6422 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
6423 case ISD::USUBO_CARRY:
6424 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
6425 case ISD::SADDO_CARRY:
6426 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
6427 case ISD::SSUBO_CARRY:
6428 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
6429 case ISD::SADDO:
6430 case ISD::UADDO:
6431 case ISD::SSUBO:
6432 case ISD::USUBO:
6433 case ISD::SMULO:
6434 case ISD::UMULO:
6435 return LowerXALUO(Op, DAG);
6436 case ISD::FADD:
6437 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
6438 case ISD::FSUB:
6439 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
6440 case ISD::FMUL:
6441 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
6442 case ISD::FMA:
6443 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
6444 case ISD::FDIV:
6445 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
6446 case ISD::FNEG:
6447 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
6448 case ISD::FCEIL:
6449 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
6450 case ISD::FFLOOR:
6451 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
6452 case ISD::FNEARBYINT:
6453 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
6454 case ISD::FRINT:
6455 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
6456 case ISD::FROUND:
6457 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
6458 case ISD::FROUNDEVEN:
6459 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
6460 case ISD::FTRUNC:
6461 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
6462 case ISD::FSQRT:
6463 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
6464 case ISD::FABS:
6465 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
6466 case ISD::FP_ROUND:
6468 return LowerFP_ROUND(Op, DAG);
6469 case ISD::FP_EXTEND:
6470 return LowerFP_EXTEND(Op, DAG);
6471 case ISD::FRAMEADDR:
6472 return LowerFRAMEADDR(Op, DAG);
6473 case ISD::SPONENTRY:
6474 return LowerSPONENTRY(Op, DAG);
6475 case ISD::RETURNADDR:
6476 return LowerRETURNADDR(Op, DAG);
6478 return LowerADDROFRETURNADDR(Op, DAG);
6480 return LowerCONCAT_VECTORS(Op, DAG);
6482 return LowerINSERT_VECTOR_ELT(Op, DAG);
6484 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
6485 case ISD::BUILD_VECTOR:
6486 return LowerBUILD_VECTOR(Op, DAG);
6488 return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
6490 return LowerVECTOR_SHUFFLE(Op, DAG);
6491 case ISD::SPLAT_VECTOR:
6492 return LowerSPLAT_VECTOR(Op, DAG);
6494 return LowerEXTRACT_SUBVECTOR(Op, DAG);
6496 return LowerINSERT_SUBVECTOR(Op, DAG);
6497 case ISD::SDIV:
6498 case ISD::UDIV:
6499 return LowerDIV(Op, DAG);
6500 case ISD::SMIN:
6501 case ISD::UMIN:
6502 case ISD::SMAX:
6503 case ISD::UMAX:
6504 return LowerMinMax(Op, DAG);
6505 case ISD::SRA:
6506 case ISD::SRL:
6507 case ISD::SHL:
6508 return LowerVectorSRA_SRL_SHL(Op, DAG);
6509 case ISD::SHL_PARTS:
6510 case ISD::SRL_PARTS:
6511 case ISD::SRA_PARTS:
6512 return LowerShiftParts(Op, DAG);
6513 case ISD::CTPOP:
6514 case ISD::PARITY:
6515 return LowerCTPOP_PARITY(Op, DAG);
6516 case ISD::FCOPYSIGN:
6517 return LowerFCOPYSIGN(Op, DAG);
6518 case ISD::OR:
6519 return LowerVectorOR(Op, DAG);
6520 case ISD::XOR:
6521 return LowerXOR(Op, DAG);
6522 case ISD::PREFETCH:
6523 return LowerPREFETCH(Op, DAG);
6524 case ISD::SINT_TO_FP:
6525 case ISD::UINT_TO_FP:
6528 return LowerINT_TO_FP(Op, DAG);
6529 case ISD::FP_TO_SINT:
6530 case ISD::FP_TO_UINT:
6533 return LowerFP_TO_INT(Op, DAG);
6536 return LowerFP_TO_INT_SAT(Op, DAG);
6537 case ISD::FSINCOS:
6538 return LowerFSINCOS(Op, DAG);
6539 case ISD::GET_ROUNDING:
6540 return LowerGET_ROUNDING(Op, DAG);
6541 case ISD::SET_ROUNDING:
6542 return LowerSET_ROUNDING(Op, DAG);
6543 case ISD::MUL:
6544 return LowerMUL(Op, DAG);
6545 case ISD::MULHS:
6546 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
6547 case ISD::MULHU:
6548 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
6550 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6552 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6554 return LowerINTRINSIC_VOID(Op, DAG);
6555 case ISD::ATOMIC_STORE:
6556 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
6557 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
6558 return LowerStore128(Op, DAG);
6559 }
6560 return SDValue();
6561 case ISD::STORE:
6562 return LowerSTORE(Op, DAG);
6563 case ISD::MSTORE:
6564 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
6565 case ISD::MGATHER:
6566 return LowerMGATHER(Op, DAG);
6567 case ISD::MSCATTER:
6568 return LowerMSCATTER(Op, DAG);
6570 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
6571 case ISD::VECREDUCE_ADD:
6572 case ISD::VECREDUCE_AND:
6573 case ISD::VECREDUCE_OR:
6574 case ISD::VECREDUCE_XOR:
6584 return LowerVECREDUCE(Op, DAG);
6586 return LowerATOMIC_LOAD_AND(Op, DAG);
6588 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6589 case ISD::VSCALE:
6590 return LowerVSCALE(Op, DAG);
6591 case ISD::ANY_EXTEND:
6592 case ISD::SIGN_EXTEND:
6593 case ISD::ZERO_EXTEND:
6594 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
6596 // Only custom lower when ExtraVT has a legal byte based element type.
6597 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
6598 EVT ExtraEltVT = ExtraVT.getVectorElementType();
6599 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
6600 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
6601 return SDValue();
6602
6603 return LowerToPredicatedOp(Op, DAG,
6605 }
6606 case ISD::TRUNCATE:
6607 return LowerTRUNCATE(Op, DAG);
6608 case ISD::MLOAD:
6609 return LowerMLOAD(Op, DAG);
6610 case ISD::LOAD:
6611 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
6612 !Subtarget->isNeonAvailable()))
6613 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
6614 return LowerLOAD(Op, DAG);
6615 case ISD::ADD:
6616 case ISD::AND:
6617 case ISD::SUB:
6618 return LowerToScalableOp(Op, DAG);
6619 case ISD::FMAXIMUM:
6620 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
6621 case ISD::FMAXNUM:
6622 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
6623 case ISD::FMINIMUM:
6624 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
6625 case ISD::FMINNUM:
6626 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
6627 case ISD::VSELECT:
6628 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
6629 case ISD::ABS:
6630 return LowerABS(Op, DAG);
6631 case ISD::ABDS:
6632 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
6633 case ISD::ABDU:
6634 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
6635 case ISD::AVGFLOORS:
6636 return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
6637 case ISD::AVGFLOORU:
6638 return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
6639 case ISD::AVGCEILS:
6640 return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
6641 case ISD::AVGCEILU:
6642 return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
6643 case ISD::BITREVERSE:
6644 return LowerBitreverse(Op, DAG);
6645 case ISD::BSWAP:
6646 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
6647 case ISD::CTLZ:
6648 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
6649 case ISD::CTTZ:
6650 return LowerCTTZ(Op, DAG);
6651 case ISD::VECTOR_SPLICE:
6652 return LowerVECTOR_SPLICE(Op, DAG);
6654 return LowerVECTOR_DEINTERLEAVE(Op, DAG);
6656 return LowerVECTOR_INTERLEAVE(Op, DAG);
6657 case ISD::LROUND:
6658 case ISD::LLROUND:
6659 case ISD::LRINT:
6660 case ISD::LLRINT: {
6661 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
6662 Op.getOperand(0).getValueType() == MVT::bf16) &&
6663 "Expected custom lowering of rounding operations only for f16");
6664 SDLoc DL(Op);
6665 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
6666 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
6667 }
6668 case ISD::STRICT_LROUND:
6670 case ISD::STRICT_LRINT:
6671 case ISD::STRICT_LLRINT: {
6672 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
6673 Op.getOperand(1).getValueType() == MVT::bf16) &&
6674 "Expected custom lowering of rounding operations only for f16");
6675 SDLoc DL(Op);
6676 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
6677 {Op.getOperand(0), Op.getOperand(1)});
6678 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
6679 {Ext.getValue(1), Ext.getValue(0)});
6680 }
6681 case ISD::WRITE_REGISTER: {
6682 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
6683 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
6684 SDLoc DL(Op);
6685
6686 SDValue Chain = Op.getOperand(0);
6687 SDValue SysRegName = Op.getOperand(1);
6688 std::pair<SDValue, SDValue> Pair =
6689 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
6690
6691 // chain = MSRR(chain, sysregname, lo, hi)
6692 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
6693 SysRegName, Pair.first, Pair.second);
6694
6695 return Result;
6696 }
6697 case ISD::FSHL:
6698 case ISD::FSHR:
6699 return LowerFunnelShift(Op, DAG);
6700 case ISD::FLDEXP:
6701 return LowerFLDEXP(Op, DAG);
6702 }
6703}
6704
6706 return !Subtarget->useSVEForFixedLengthVectors();
6707}
6708
 6709 bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
 6710     EVT VT, bool OverrideNEON) const {
6711 if (!VT.isFixedLengthVector() || !VT.isSimple())
6712 return false;
6713
6714 // Don't use SVE for vectors we cannot scalarize if required.
6715 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
6716 // Fixed length predicates should be promoted to i8.
6717 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
6718 case MVT::i1:
6719 default:
6720 return false;
6721 case MVT::i8:
6722 case MVT::i16:
6723 case MVT::i32:
6724 case MVT::i64:
6725 case MVT::f16:
6726 case MVT::f32:
6727 case MVT::f64:
6728 break;
6729 }
6730
6731 // NEON-sized vectors can be emulated using SVE instructions.
6732 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
6733 return Subtarget->hasSVEorSME();
6734
6735 // Ensure NEON MVTs only belong to a single register class.
6736 if (VT.getFixedSizeInBits() <= 128)
6737 return false;
6738
6739 // Ensure wider than NEON code generation is enabled.
6740 if (!Subtarget->useSVEForFixedLengthVectors())
6741 return false;
6742
6743 // Don't use SVE for types that don't fit.
6744 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
6745 return false;
6746
6747 // TODO: Perhaps an artificial restriction, but worth having whilst getting
6748 // the base fixed length SVE support in place.
6749 if (!VT.isPow2VectorType())
6750 return false;
6751
6752 return true;
6753}
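// Illustrative example: with a minimum SVE vector width of 256 bits, a fixed
// v8i32 passes all of the checks above and is lowered using SVE, whereas a
// 128-bit v4i32 stays on NEON unless OverrideNEON is set and SVE/SME is
// available.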
6754
6755//===----------------------------------------------------------------------===//
6756// Calling Convention Implementation
6757//===----------------------------------------------------------------------===//
6758
6759static unsigned getIntrinsicID(const SDNode *N) {
6760 unsigned Opcode = N->getOpcode();
6761 switch (Opcode) {
6762 default:
6765 unsigned IID = N->getConstantOperandVal(0);
6766 if (IID < Intrinsic::num_intrinsics)
6767 return IID;
6769 }
6770 }
6771}
6772
6774 SDValue N1) const {
6775 if (!N0.hasOneUse())
6776 return false;
6777
6778 unsigned IID = getIntrinsicID(N1.getNode());
6779 // Avoid reassociating expressions that can be lowered to smlal/umlal.
6780 if (IID == Intrinsic::aarch64_neon_umull ||
6781 N1.getOpcode() == AArch64ISD::UMULL ||
6782 IID == Intrinsic::aarch64_neon_smull ||
6784 return N0.getOpcode() != ISD::ADD;
6785
6786 return true;
6787}
6788
6789/// Selects the correct CCAssignFn for a given CallingConvention value.
 6790 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
 6791                                                      bool IsVarArg) const {
6792 switch (CC) {
6793 default:
6794 report_fatal_error("Unsupported calling convention.");
6795 case CallingConv::GHC:
6796 return CC_AArch64_GHC;
6797 case CallingConv::C:
6798 case CallingConv::Fast:
6802 case CallingConv::Swift:
6804 case CallingConv::Tail:
6805 case CallingConv::GRAAL:
6806 if (Subtarget->isTargetWindows()) {
6807 if (IsVarArg) {
6808 if (Subtarget->isWindowsArm64EC())
6811 }
6812 return CC_AArch64_Win64PCS;
6813 }
6814 if (!Subtarget->isTargetDarwin())
6815 return CC_AArch64_AAPCS;
6816 if (!IsVarArg)
6817 return CC_AArch64_DarwinPCS;
6820 case CallingConv::Win64:
6821 if (IsVarArg) {
6822 if (Subtarget->isWindowsArm64EC())
6825 }
6826 return CC_AArch64_Win64PCS;
6828 if (Subtarget->isWindowsArm64EC())
6835 return CC_AArch64_AAPCS;
6840 }
6841}
6842
6843CCAssignFn *
6845 switch (CC) {
6846 default:
6847 return RetCC_AArch64_AAPCS;
6851 if (Subtarget->isWindowsArm64EC())
6853 return RetCC_AArch64_AAPCS;
6854 }
6855}
6856
6857
6858unsigned
6859AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL,
6860 SelectionDAG &DAG) const {
6862 MachineFrameInfo &MFI = MF.getFrameInfo();
6863
6864 // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case)
6865 SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
6866 DAG.getConstant(1, DL, MVT::i32));
6867 SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N);
6868 SDValue Ops[] = {Chain, NN, DAG.getConstant(1, DL, MVT::i64)};
6869 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other);
6870 SDValue Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, VTs, Ops);
6871 Chain = Buffer.getValue(1);
6872 MFI.CreateVariableSizedObject(Align(1), nullptr);
6873
6874 // Allocate an additional TPIDR2 object on the stack (16 bytes)
6875 unsigned TPIDR2Obj = MFI.CreateStackObject(16, Align(16), false);
6876
6877 // Store the buffer pointer to the TPIDR2 stack object.
6880 TPIDR2Obj,
6882 Chain = DAG.getStore(Chain, DL, Buffer, Ptr, MPI);
6883
6884 // Set the reserved bytes (10-15) to zero
6885 EVT PtrTy = Ptr.getValueType();
6886 SDValue ReservedPtr =
6887 DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(10, DL, PtrTy));
6888 Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i16), ReservedPtr,
6889 MPI);
6890 ReservedPtr =
6891 DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(12, DL, PtrTy));
6892 Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i32), ReservedPtr,
6893 MPI);
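  // The resulting 16-byte TPIDR2 block records the lazy-save buffer address
  // in its first 8 bytes, with the reserved bytes 10-15 cleared above; the
  // buffer itself is SVL.B * SVL.B bytes, enough for a full ZA save.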
6894
6895 return TPIDR2Obj;
6896}
6897
6898static bool isPassedInFPR(EVT VT) {
6899 return VT.isFixedLengthVector() ||
6900 (VT.isFloatingPoint() && !VT.isScalableVector());
6901}
6902
6903SDValue AArch64TargetLowering::LowerFormalArguments(
6904 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
6905 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
6906 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
6908 const Function &F = MF.getFunction();
6909 MachineFrameInfo &MFI = MF.getFrameInfo();
6910 bool IsWin64 = Subtarget->isCallingConvWin64(F.getCallingConv());
6911 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
6912 (isVarArg && Subtarget->isWindowsArm64EC());
6914
6916 GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
6918 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
6919 FuncInfo->setIsSVECC(true);
6920
6921 // Assign locations to all of the incoming arguments.
6923 DenseMap<unsigned, SDValue> CopiedRegs;
6924 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
6925
6926 // At this point, Ins[].VT may already be promoted to i32. To correctly
6927 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
6928 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
6929 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
6930 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
6931 // LocVT.
6932 unsigned NumArgs = Ins.size();
6933 Function::const_arg_iterator CurOrigArg = F.arg_begin();
6934 unsigned CurArgIdx = 0;
6935 for (unsigned i = 0; i != NumArgs; ++i) {
6936 MVT ValVT = Ins[i].VT;
6937 if (Ins[i].isOrigArg()) {
6938 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
6939 CurArgIdx = Ins[i].getOrigArgIndex();
6940
6941 // Get type of the original argument.
6942 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
6943 /*AllowUnknown*/ true);
6944 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
6945 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
6946 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
6947 ValVT = MVT::i8;
6948 else if (ActualMVT == MVT::i16)
6949 ValVT = MVT::i16;
6950 }
6951 bool UseVarArgCC = false;
6952 if (IsWin64)
6953 UseVarArgCC = isVarArg;
6954 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
6955 bool Res =
6956 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
6957 assert(!Res && "Call operand has unhandled type");
6958 (void)Res;
6959 }
6960
6962 bool IsLocallyStreaming =
6963 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
6964 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
6965 SDValue Glue = Chain.getValue(1);
6966
6967 SmallVector<SDValue, 16> ArgValues;
6968 unsigned ExtraArgLocs = 0;
6969 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
6970 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
6971
6972 if (Ins[i].Flags.isByVal()) {
6973 // Byval is used for HFAs in the PCS, but the system should work in a
6974 // non-compliant manner for larger structs.
6975 EVT PtrVT = getPointerTy(DAG.getDataLayout());
6976 int Size = Ins[i].Flags.getByValSize();
6977 unsigned NumRegs = (Size + 7) / 8;
6978
 6979       // FIXME: This works on big-endian for composite byvals, which are the
 6980       // common case. It should also work for fundamental types.
6981 unsigned FrameIdx =
6982 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
6983 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
6984 InVals.push_back(FrameIdxN);
6985
6986 continue;
6987 }
6988
6989 if (Ins[i].Flags.isSwiftAsync())
6991
6992 SDValue ArgValue;
6993 if (VA.isRegLoc()) {
6994 // Arguments stored in registers.
6995 EVT RegVT = VA.getLocVT();
6996 const TargetRegisterClass *RC;
6997
6998 if (RegVT == MVT::i32)
6999 RC = &AArch64::GPR32RegClass;
7000 else if (RegVT == MVT::i64)
7001 RC = &AArch64::GPR64RegClass;
7002 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
7003 RC = &AArch64::FPR16RegClass;
7004 else if (RegVT == MVT::f32)
7005 RC = &AArch64::FPR32RegClass;
7006 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
7007 RC = &AArch64::FPR64RegClass;
7008 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
7009 RC = &AArch64::FPR128RegClass;
7010 else if (RegVT.isScalableVector() &&
7011 RegVT.getVectorElementType() == MVT::i1) {
7012 FuncInfo->setIsSVECC(true);
7013 RC = &AArch64::PPRRegClass;
7014 } else if (RegVT == MVT::aarch64svcount) {
7015 FuncInfo->setIsSVECC(true);
7016 RC = &AArch64::PPRRegClass;
7017 } else if (RegVT.isScalableVector()) {
7018 FuncInfo->setIsSVECC(true);
7019 RC = &AArch64::ZPRRegClass;
7020 } else
7021 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
7022
7023 // Transform the arguments in physical registers into virtual ones.
7024 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
7025
7026 if (IsLocallyStreaming) {
7027 // LocallyStreamingFunctions must insert the SMSTART in the correct
7028 // position, so we use Glue to ensure no instructions can be scheduled
7029 // between the chain of:
7030 // t0: ch,glue = EntryNode
7031 // t1: res,ch,glue = CopyFromReg
7032 // ...
7033 // tn: res,ch,glue = CopyFromReg t(n-1), ..
7034 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
7035 // ^^^^^^
7036 // This will be the new Chain/Root node.
7037 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
7038 Glue = ArgValue.getValue(2);
7039 if (isPassedInFPR(ArgValue.getValueType())) {
7040 ArgValue =
7042 DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
7043 {ArgValue, Glue});
7044 Glue = ArgValue.getValue(1);
7045 }
7046 } else
7047 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
7048
7049 // If this is an 8, 16 or 32-bit value, it is really passed promoted
7050 // to 64 bits. Insert an assert[sz]ext to capture this, then
7051 // truncate to the right size.
7052 switch (VA.getLocInfo()) {
7053 default:
7054 llvm_unreachable("Unknown loc info!");
7055 case CCValAssign::Full:
7056 break;
7058 assert(
7059 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
7060 "Indirect arguments should be scalable on most subtargets");
7061 break;
7062 case CCValAssign::BCvt:
7063 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
7064 break;
7065 case CCValAssign::AExt:
7066 case CCValAssign::SExt:
7067 case CCValAssign::ZExt:
7068 break;
7070 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
7071 DAG.getConstant(32, DL, RegVT));
7072 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
7073 break;
7074 }
7075 } else { // VA.isRegLoc()
7076 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
7077 unsigned ArgOffset = VA.getLocMemOffset();
7078 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
7079 ? VA.getLocVT().getSizeInBits()
7080 : VA.getValVT().getSizeInBits()) / 8;
7081
7082 uint32_t BEAlign = 0;
7083 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
7084 !Ins[i].Flags.isInConsecutiveRegs())
7085 BEAlign = 8 - ArgSize;
7086
7087 SDValue FIN;
7088 MachinePointerInfo PtrInfo;
7089 if (StackViaX4) {
7090 // In both the ARM64EC varargs convention and the thunk convention,
7091 // arguments on the stack are accessed relative to x4, not sp. In
7092 // the thunk convention, there's an additional offset of 32 bytes
7093 // to account for the shadow store.
7094 unsigned ObjOffset = ArgOffset + BEAlign;
7095 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
7096 ObjOffset += 32;
7097 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
7098 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7099 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
7100 DAG.getConstant(ObjOffset, DL, MVT::i64));
7102 } else {
7103 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
7104
7105 // Create load nodes to retrieve arguments from the stack.
7106 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
7107 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
7108 }
7109
 7110       // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
7112 MVT MemVT = VA.getValVT();
7113
7114 switch (VA.getLocInfo()) {
7115 default:
7116 break;
7117 case CCValAssign::Trunc:
7118 case CCValAssign::BCvt:
7119 MemVT = VA.getLocVT();
7120 break;
7123 Subtarget->isWindowsArm64EC()) &&
7124 "Indirect arguments should be scalable on most subtargets");
7125 MemVT = VA.getLocVT();
7126 break;
7127 case CCValAssign::SExt:
7128 ExtType = ISD::SEXTLOAD;
7129 break;
7130 case CCValAssign::ZExt:
7131 ExtType = ISD::ZEXTLOAD;
7132 break;
7133 case CCValAssign::AExt:
7134 ExtType = ISD::EXTLOAD;
7135 break;
7136 }
7137
7138 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
7139 MemVT);
7140 }
7141
7142 if (VA.getLocInfo() == CCValAssign::Indirect) {
7143 assert((VA.getValVT().isScalableVT() ||
7144 Subtarget->isWindowsArm64EC()) &&
7145 "Indirect arguments should be scalable on most subtargets");
7146
7147 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
7148 unsigned NumParts = 1;
7149 if (Ins[i].Flags.isInConsecutiveRegs()) {
7150 assert(!Ins[i].Flags.isInConsecutiveRegsLast());
7151 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
7152 ++NumParts;
7153 }
7154
7155 MVT PartLoad = VA.getValVT();
7156 SDValue Ptr = ArgValue;
7157
7158 // Ensure we generate all loads for each tuple part, whilst updating the
7159 // pointer after each load correctly using vscale.
7160 while (NumParts > 0) {
7161 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
7162 InVals.push_back(ArgValue);
7163 NumParts--;
7164 if (NumParts > 0) {
7165 SDValue BytesIncrement;
7166 if (PartLoad.isScalableVector()) {
7167 BytesIncrement = DAG.getVScale(
7168 DL, Ptr.getValueType(),
7169 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
7170 } else {
7171 BytesIncrement = DAG.getConstant(
7172 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
7173 Ptr.getValueType());
7174 }
7176 Flags.setNoUnsignedWrap(true);
7177 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7178 BytesIncrement, Flags);
7179 ExtraArgLocs++;
7180 i++;
7181 }
7182 }
7183 } else {
7184 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
7185 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
7186 ArgValue, DAG.getValueType(MVT::i32));
7187
7188 // i1 arguments are zero-extended to i8 by the caller. Emit a
7189 // hint to reflect this.
7190 if (Ins[i].isOrigArg()) {
7191 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
7192 if (OrigArg->getType()->isIntegerTy(1)) {
7193 if (!Ins[i].Flags.isZExt()) {
7194 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
7195 ArgValue.getValueType(), ArgValue);
7196 }
7197 }
7198 }
7199
7200 InVals.push_back(ArgValue);
7201 }
7202 }
7203 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
7204
7205 // Insert the SMSTART if this is a locally streaming function and
7206 // make sure it is Glued to the last CopyFromReg value.
7207 if (IsLocallyStreaming) {
7208 SDValue PStateSM;
7209 if (Attrs.hasStreamingCompatibleInterface()) {
7210 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
7213 FuncInfo->setPStateSMReg(Reg);
7214 Chain = DAG.getCopyToReg(Chain, DL, Reg, PStateSM);
7215 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
7217 } else
7218 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
7220
7221 // Ensure that the SMSTART happens after the CopyWithChain such that its
7222 // chain result is used.
7223 for (unsigned I=0; I<InVals.size(); ++I) {
7225 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
7226 Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
7227 InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
7228 InVals[I].getValueType());
7229 }
7230 }
7231
7232 // varargs
7233 if (isVarArg) {
7234 if (!Subtarget->isTargetDarwin() || IsWin64) {
7235 // The AAPCS variadic function ABI is identical to the non-variadic
7236 // one. As a result there may be more arguments in registers and we should
7237 // save them for future reference.
7238 // Win64 variadic functions also pass arguments in registers, but all float
7239 // arguments are passed in integer registers.
7240 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
7241 }
7242
7243 // This will point to the next argument passed via stack.
7244 unsigned VarArgsOffset = CCInfo.getStackSize();
7245 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
7246 VarArgsOffset = alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
7247 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
7248 FuncInfo->setVarArgsStackIndex(
7249 MFI.CreateFixedObject(4, VarArgsOffset, true));
7250
7251 if (MFI.hasMustTailInVarArgFunc()) {
7252 SmallVector<MVT, 2> RegParmTypes;
7253 RegParmTypes.push_back(MVT::i64);
7254 RegParmTypes.push_back(MVT::f128);
7255 // Compute the set of forwarded registers. The rest are scratch.
7257 FuncInfo->getForwardedMustTailRegParms();
7258 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
7260
7261 // Conservatively forward X8, since it might be used for aggregate return.
7262 if (!CCInfo.isAllocated(AArch64::X8)) {
7263 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
7264 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
7265 }
7266 }
7267 }
7268
7269 // On Windows, InReg pointers must be returned, so record the pointer in a
7270 // virtual register at the start of the function so it can be returned in the
7271 // epilogue.
7272 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
7273 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
7274 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
7275 Ins[I].Flags.isInReg()) &&
7276 Ins[I].Flags.isSRet()) {
7277 assert(!FuncInfo->getSRetReturnReg());
7278
7279 MVT PtrTy = getPointerTy(DAG.getDataLayout());
7280 Register Reg =
7282 FuncInfo->setSRetReturnReg(Reg);
7283
7284 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
7285 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
7286 break;
7287 }
7288 }
7289 }
7290
7291 unsigned StackArgSize = CCInfo.getStackSize();
7292 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
7293 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
7294 // This is a non-standard ABI so by fiat I say we're allowed to make full
7295 // use of the stack area to be popped, which must be aligned to 16 bytes in
7296 // any case:
7297 StackArgSize = alignTo(StackArgSize, 16);
7298
7299 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
7300 // a multiple of 16.
7301 FuncInfo->setArgumentStackToRestore(StackArgSize);
7302
7303 // This realignment carries over to the available bytes below. Our own
7304 // callers will guarantee the space is free by giving an aligned value to
7305 // CALLSEQ_START.
7306 }
7307 // Even if we're not expected to free up the space, it's useful to know how
7308 // much is there while considering tail calls (because we can reuse it).
7309 FuncInfo->setBytesInStackArgArea(StackArgSize);
7310
7311 if (Subtarget->hasCustomCallingConv())
7313
7314 // Conservatively assume the function requires the lazy-save mechanism.
7315 if (SMEAttrs(MF.getFunction()).hasZAState()) {
7316 unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG);
7317 FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj);
7318 }
7319
7320 return Chain;
7321}
7322
7323void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
7324 SelectionDAG &DAG,
7325 const SDLoc &DL,
7326 SDValue &Chain) const {
7328 MachineFrameInfo &MFI = MF.getFrameInfo();
7330 auto PtrVT = getPointerTy(DAG.getDataLayout());
7331 bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
7332
7334
7336 unsigned NumGPRArgRegs = GPRArgRegs.size();
7337 if (Subtarget->isWindowsArm64EC()) {
7338 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
7339 // functions.
7340 NumGPRArgRegs = 4;
7341 }
7342 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
7343
7344 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
7345 int GPRIdx = 0;
7346 if (GPRSaveSize != 0) {
7347 if (IsWin64) {
7348 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
7349 if (GPRSaveSize & 15)
7350 // The extra size here, if triggered, will always be 8.
7351 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
7352 } else
7353 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
7354
7355 SDValue FIN;
7356 if (Subtarget->isWindowsArm64EC()) {
7357 // With the Arm64EC ABI, we reserve the save area as usual, but we
7358 // compute its address relative to x4. For a normal AArch64->AArch64
7359 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
7360 // different address.
7361 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
7362 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7363 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
7364 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
7365 } else {
7366 FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
7367 }
7368
7369 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
7370 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
7371 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7372 SDValue Store =
7373 DAG.getStore(Val.getValue(1), DL, Val, FIN,
7375 MF, GPRIdx, (i - FirstVariadicGPR) * 8)
7376 : MachinePointerInfo::getStack(MF, i * 8));
7377 MemOps.push_back(Store);
7378 FIN =
7379 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
7380 }
7381 }
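  // The save area filled above holds the unnamed GPR arguments (x0-x7 under
  // AAPCS, x0-x3 under Arm64EC) at 8 bytes apiece, starting from the first
  // register not consumed by a named argument; va_arg later walks this area
  // using the index and size recorded below.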
7382 FuncInfo->setVarArgsGPRIndex(GPRIdx);
7383 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
7384
7385 if (Subtarget->hasFPARMv8() && !IsWin64) {
7387 const unsigned NumFPRArgRegs = FPRArgRegs.size();
7388 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
7389
7390 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
7391 int FPRIdx = 0;
7392 if (FPRSaveSize != 0) {
7393 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
7394
7395 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
7396
7397 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
7398 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
7399 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
7400
7401 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
7402 MachinePointerInfo::getStack(MF, i * 16));
7403 MemOps.push_back(Store);
7404 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
7405 DAG.getConstant(16, DL, PtrVT));
7406 }
7407 }
7408 FuncInfo->setVarArgsFPRIndex(FPRIdx);
7409 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
7410 }
7411
7412 if (!MemOps.empty()) {
7413 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
7414 }
7415}
7416
7417/// LowerCallResult - Lower the result values of a call into the
7418/// appropriate copies out of appropriate physical registers.
7419SDValue AArch64TargetLowering::LowerCallResult(
7420 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
7421 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
7422 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
7423 SDValue ThisVal, bool RequiresSMChange) const {
7424 DenseMap<unsigned, SDValue> CopiedRegs;
7425 // Copy all of the result registers out of their specified physreg.
7426 for (unsigned i = 0; i != RVLocs.size(); ++i) {
7427 CCValAssign VA = RVLocs[i];
7428
7429 // Pass 'this' value directly from the argument to return value, to avoid
7430 // reg unit interference
7431 if (i == 0 && isThisReturn) {
7432 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
7433 "unexpected return calling convention register assignment");
7434 InVals.push_back(ThisVal);
7435 continue;
7436 }
7437
7438 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
7439 // allows one use of a physreg per block.
7440 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
7441 if (!Val) {
7442 Val =
7443 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
7444 Chain = Val.getValue(1);
7445 InGlue = Val.getValue(2);
7446 CopiedRegs[VA.getLocReg()] = Val;
7447 }
7448
7449 switch (VA.getLocInfo()) {
7450 default:
7451 llvm_unreachable("Unknown loc info!");
7452 case CCValAssign::Full:
7453 break;
7454 case CCValAssign::BCvt:
7455 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
7456 break;
7458 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
7459 DAG.getConstant(32, DL, VA.getLocVT()));
7460 [[fallthrough]];
7461 case CCValAssign::AExt:
7462 [[fallthrough]];
7463 case CCValAssign::ZExt:
7464 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
7465 break;
7466 }
7467
7468 if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
7470 Val);
7471
7472 InVals.push_back(Val);
7473 }
7474
7475 return Chain;
7476}
7477
7478/// Return true if the calling convention is one that we can guarantee TCO for.
7479static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
 7480   return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
 7481          CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
7482}
7483
7484/// Return true if we might ever do TCO for calls with this calling convention.
7486 switch (CC) {
7487 case CallingConv::C:
7491 case CallingConv::Swift:
7493 case CallingConv::Tail:
7494 case CallingConv::Fast:
7495 return true;
7496 default:
7497 return false;
7498 }
7499}
7500
7502 const AArch64Subtarget *Subtarget,
7504 CCState &CCInfo) {
7505 const SelectionDAG &DAG = CLI.DAG;
7506 CallingConv::ID CalleeCC = CLI.CallConv;
7507 bool IsVarArg = CLI.IsVarArg;
7508 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7509 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
7510
7511 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
7512 // for the shadow store.
7513 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
7514 CCInfo.AllocateStack(32, Align(16));
7515
7516 unsigned NumArgs = Outs.size();
7517 for (unsigned i = 0; i != NumArgs; ++i) {
7518 MVT ArgVT = Outs[i].VT;
7519 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
7520
7521 bool UseVarArgCC = false;
7522 if (IsVarArg) {
7523 // On Windows, the fixed arguments in a vararg call are passed in GPRs
7524 // too, so use the vararg CC to force them to integer registers.
7525 if (IsCalleeWin64) {
7526 UseVarArgCC = true;
7527 } else {
7528 UseVarArgCC = !Outs[i].IsFixed;
7529 }
7530 }
7531
7532 if (!UseVarArgCC) {
7533 // Get type of the original argument.
7534 EVT ActualVT =
7535 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
7536 /*AllowUnknown*/ true);
7537 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
7538 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
7539 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
7540 ArgVT = MVT::i8;
7541 else if (ActualMVT == MVT::i16)
7542 ArgVT = MVT::i16;
7543 }
7544
7545 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
7546 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
7547 assert(!Res && "Call operand has unhandled type");
7548 (void)Res;
7549 }
7550}
7551
7552bool AArch64TargetLowering::isEligibleForTailCallOptimization(
7553 const CallLoweringInfo &CLI) const {
7554 CallingConv::ID CalleeCC = CLI.CallConv;
7555 if (!mayTailCallThisCC(CalleeCC))
7556 return false;
7557
7558 SDValue Callee = CLI.Callee;
7559 bool IsVarArg = CLI.IsVarArg;
7560 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7561 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
7562 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
7563 const SelectionDAG &DAG = CLI.DAG;
7564 MachineFunction &MF = DAG.getMachineFunction();
7565 const Function &CallerF = MF.getFunction();
7566 CallingConv::ID CallerCC = CallerF.getCallingConv();
7567
7568 // SME Streaming functions are not eligible for TCO as they may require
7569 // the streaming mode or ZA to be restored after returning from the call.
7570 SMEAttrs CallerAttrs(MF.getFunction());
7571 auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal);
7572 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
7573 CallerAttrs.requiresLazySave(CalleeAttrs) ||
7574 CallerAttrs.hasStreamingBody())
7575 return false;
7576
7577 // Functions using the C or Fast calling convention that have an SVE signature
7578 // preserve more registers and should assume the SVE_VectorCall CC.
7579 // The check for matching callee-saved regs will determine whether it is
7580 // eligible for TCO.
7581 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
7582 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
7583 CallerCC = CallingConv::AArch64_SVE_VectorCall;
7584
7585 bool CCMatch = CallerCC == CalleeCC;
7586
7587 // When using the Windows calling convention on a non-windows OS, we want
7588 // to back up and restore X18 in such functions; we can't do a tail call
7589 // from those functions.
7590 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
7591 CalleeCC != CallingConv::Win64)
7592 return false;
7593
7594 // Byval parameters hand the function a pointer directly into the stack area
7595 // we want to reuse during a tail call. Working around this *is* possible (see
7596 // X86) but less efficient and uglier in LowerCall.
7597 for (Function::const_arg_iterator i = CallerF.arg_begin(),
7598 e = CallerF.arg_end();
7599 i != e; ++i) {
7600 if (i->hasByValAttr())
7601 return false;
7602
7603 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
7604 // In this case, it is necessary to save/restore X0 in the callee. Tail
7605 // call opt interferes with this. So we disable tail call opt when the
7606 // caller has an argument with "inreg" attribute.
7607
7608 // FIXME: Check whether the callee also has an "inreg" argument.
7609 if (i->hasInRegAttr())
7610 return false;
7611 }
7612
7613 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
7614 return CCMatch;
7615
7616 // Externally-defined functions with weak linkage should not be
7617 // tail-called on AArch64 when the OS does not support dynamic
7618 // pre-emption of symbols, as the AAELF spec requires normal calls
7619 // to undefined weak functions to be replaced with a NOP or jump to the
7620 // next instruction. The behaviour of branch instructions in this
7621 // situation (as used for tail calls) is implementation-defined, so we
7622 // cannot rely on the linker replacing the tail call with a return.
7623 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
7624 const GlobalValue *GV = G->getGlobal();
7625 const Triple &TT = getTargetMachine().getTargetTriple();
7626 if (GV->hasExternalWeakLinkage() &&
7627 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
7628 return false;
7629 }
7630
7631 // Now we search for cases where we can use a tail call without changing the
7632 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
7633 // concept.
7634
7635 // I want anyone implementing a new calling convention to think long and hard
7636 // about this assert.
7637 assert((!IsVarArg || CalleeCC == CallingConv::C) &&
7638 "Unexpected variadic calling convention");
7639
7640 LLVMContext &C = *DAG.getContext();
7641 // Check that the call results are passed in the same way.
7642 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
7643 CCAssignFnForCall(CalleeCC, IsVarArg),
7644 CCAssignFnForCall(CallerCC, IsVarArg)))
7645 return false;
7646 // The callee has to preserve all registers the caller needs to preserve.
7647 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7648 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
7649 if (!CCMatch) {
7650 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
7651 if (Subtarget->hasCustomCallingConv()) {
7652 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
7653 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
7654 }
7655 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
7656 return false;
7657 }
7658
7659 // Nothing more to check if the callee is taking no arguments
7660 if (Outs.empty())
7661 return true;
7662
7663 SmallVector<CCValAssign, 16> ArgLocs;
7664 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
7665
7666 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
7667
7668 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
7669 // When the call is musttail, additional checks have already been done, so we can safely skip this check.
7670 // At least two cases here: if caller is fastcc then we can't have any
7671 // memory arguments (we'd be expected to clean up the stack afterwards). If
7672 // caller is C then we could potentially use its argument area.
7673
7674 // FIXME: for now we take the most conservative of these in both cases:
7675 // disallow all variadic memory operands.
7676 for (const CCValAssign &ArgLoc : ArgLocs)
7677 if (!ArgLoc.isRegLoc())
7678 return false;
7679 }
7680
7681 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7682
7683 // If any of the arguments is passed indirectly, it must be SVE, so the
7684 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
7685 // allocate space on the stack. That is why we explicitly determine here that
7686 // such a call cannot be a tail call.
7687 if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
7688 assert((A.getLocInfo() != CCValAssign::Indirect ||
7689 A.getValVT().isScalableVector() ||
7690 Subtarget->isWindowsArm64EC()) &&
7691 "Expected value to be scalable");
7692 return A.getLocInfo() == CCValAssign::Indirect;
7693 }))
7694 return false;
7695
7696 // If the stack arguments for this call do not fit into our own save area then
7697 // the call cannot be made tail.
7698 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
7699 return false;
7700
7701 const MachineRegisterInfo &MRI = MF.getRegInfo();
7702 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
7703 return false;
7704
7705 return true;
7706}
7707
7708SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
7709 SelectionDAG &DAG,
7710 MachineFrameInfo &MFI,
7711 int ClobberedFI) const {
7712 SmallVector<SDValue, 8> ArgChains;
7713 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
7714 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
7715
7716 // Include the original chain at the beginning of the list. When this is
7717 // used by target LowerCall hooks, this helps legalize find the
7718 // CALLSEQ_BEGIN node.
7719 ArgChains.push_back(Chain);
7720
7721 // Add a chain value for each stack argument corresponding
7722 for (SDNode *U : DAG.getEntryNode().getNode()->uses())
7723 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
7724 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
7725 if (FI->getIndex() < 0) {
7726 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
7727 int64_t InLastByte = InFirstByte;
7728 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
7729
7730 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
7731 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
7732 ArgChains.push_back(SDValue(L, 1));
7733 }
7734
7735 // Build a tokenfactor for all the chains.
7736 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
7737}
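// The two range comparisons above detect any overlap between the clobbered
// slot [FirstByte, LastByte] and an incoming argument load [InFirstByte,
// InLastByte]: two intervals overlap exactly when the start of one lies
// within the other.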
7738
7739bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
7740 bool TailCallOpt) const {
7741 return (CallCC == CallingConv::Fast && TailCallOpt) ||
7742 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
7743}
7744
7745// Check if the value is zero-extended from i1 to i8
7746static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
7747 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
7748 if (SizeInBits < 8)
7749 return false;
7750
7751 APInt RequiredZero(SizeInBits, 0xFE);
7752 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
7753 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
7754 return ZExtBool;
7755}
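// The 0xFE mask covers bits [7:1]. If known-bits analysis proves those bits
// are already zero (e.g. the value comes from a 'zext i1 ... to i8' or an AND
// with 1), the argument is already a valid zero-extended bool and the extra
// truncate/zero-extend pair in LowerCall can be skipped.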
7756
7757void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
7758 SDNode *Node) const {
7759 // Live-in physreg copies that are glued to SMSTART are applied as
7760 // implicit-def's in the InstrEmitter. Here we remove them, allowing the
7761 // register allocator to pass call args in callee saved regs, without extra
7762 // copies to avoid these fake clobbers of actually-preserved GPRs.
7763 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
7764 MI.getOpcode() == AArch64::MSRpstatePseudo) {
7765 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
7766 if (MachineOperand &MO = MI.getOperand(I);
7767 MO.isReg() && MO.isImplicit() && MO.isDef() &&
7768 (AArch64::GPR32RegClass.contains(MO.getReg()) ||
7769 AArch64::GPR64RegClass.contains(MO.getReg())))
7770 MI.removeOperand(I);
7771
7772 // The SVE vector length can change when entering/leaving streaming mode.
7773 if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
7774 MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
7775 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
7776 /*IsImplicit=*/true));
7777 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
7778 /*IsImplicit=*/true));
7779 }
7780 }
7781
7782 // Add an implicit use of 'VG' for ADDXri/SUBXri, instructions that ordinarily
7783 // have nothing to do with VG, except when they are used to materialise a
7784 // frame address. If they contain a frame-index to a scalable vector, this
7785 // will likely require an ADDVL instruction to materialise the address, thus
7786 // reading VG.
7787 const MachineFunction &MF = *MI.getMF();
7788 if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
7789 (MI.getOpcode() == AArch64::ADDXri ||
7790 MI.getOpcode() == AArch64::SUBXri)) {
7791 const MachineOperand &MO = MI.getOperand(1);
7792 if (MO.isFI() && MF.getFrameInfo().getStackID(MO.getIndex()) ==
7793 TargetStackID::ScalableVector)
7794 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
7795 /*IsImplicit=*/true));
7796 }
7797}
7798
7799SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
7800 bool Enable, SDValue Chain,
7801 SDValue InGlue,
7802 unsigned Condition,
7803 SDValue PStateSM) const {
7804 MachineFunction &MF = DAG.getMachineFunction();
7805 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7806 FuncInfo->setHasStreamingModeChanges(true);
7807
7808 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7809 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
7810 SDValue MSROp =
7811 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
7812 SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
7813 SmallVector<SDValue> Ops = {Chain, MSROp, ConditionOp};
7814 if (Condition != AArch64SME::Always) {
7815 assert(PStateSM && "PStateSM should be defined");
7816 Ops.push_back(PStateSM);
7817 }
7818 Ops.push_back(RegMask);
7819
7820 if (InGlue)
7821 Ops.push_back(InGlue);
7822
7823 unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
7824 return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
7825}
7826
7827static unsigned getSMCondition(const SMEAttrs &CallerAttrs,
7828 const SMEAttrs &CalleeAttrs) {
7829 if (!CallerAttrs.hasStreamingCompatibleInterface() ||
7830 CallerAttrs.hasStreamingBody())
7831 return AArch64SME::Always;
7832 if (CalleeAttrs.hasNonStreamingInterface())
7833 return AArch64SME::IfCallerIsStreaming;
7834 if (CalleeAttrs.hasStreamingInterface())
7835 return AArch64SME::IfCallerIsNonStreaming;
7836
7837 llvm_unreachable("Unsupported attributes");
7838}
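// The returned condition decides whether the SMSTART/SMSTOP emitted around
// the call executes unconditionally or is predicated on the caller's current
// PSTATE.SM value. For example, a streaming-compatible caller invoking a
// non-streaming callee only needs SMSTOP when it happens to be in streaming
// mode at the call site.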
7839
7840/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
7841/// and add input and output parameter nodes.
7842SDValue
7843AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
7844 SmallVectorImpl<SDValue> &InVals) const {
7845 SelectionDAG &DAG = CLI.DAG;
7846 SDLoc &DL = CLI.DL;
7847 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7848 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
7849 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
7850 SDValue Chain = CLI.Chain;
7851 SDValue Callee = CLI.Callee;
7852 bool &IsTailCall = CLI.IsTailCall;
7853 CallingConv::ID &CallConv = CLI.CallConv;
7854 bool IsVarArg = CLI.IsVarArg;
7855
7856 MachineFunction &MF = DAG.getMachineFunction();
7857 MachineFunction::CallSiteInfo CSInfo;
7858 bool IsThisReturn = false;
7859
7860 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7861 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
7862 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
7863 bool IsSibCall = false;
7864 bool GuardWithBTI = false;
7865
7866 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
7867 !Subtarget->noBTIAtReturnTwice()) {
7868 GuardWithBTI = FuncInfo->branchTargetEnforcement();
7869 }
7870
7871 // Analyze operands of the call, assigning locations to each operand.
7872 SmallVector<CCValAssign, 16> ArgLocs;
7873 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
7874
7875 if (IsVarArg) {
7876 unsigned NumArgs = Outs.size();
7877
7878 for (unsigned i = 0; i != NumArgs; ++i) {
7879 if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
7880 report_fatal_error("Passing SVE types to variadic functions is "
7881 "currently not supported");
7882 }
7883 }
7884
7885 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
7886
7887 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
7888 // Assign locations to each value returned by this call.
7889 SmallVector<CCValAssign, 16> RVLocs;
7890 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
7891 *DAG.getContext());
7892 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
7893
7894 // Check callee args/returns for SVE registers and set calling convention
7895 // accordingly.
7896 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
7897 auto HasSVERegLoc = [](CCValAssign &Loc) {
7898 if (!Loc.isRegLoc())
7899 return false;
7900 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
7901 AArch64::PPRRegClass.contains(Loc.getLocReg());
7902 };
7903 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
7904 CallConv = CallingConv::AArch64_SVE_VectorCall;
7905 }
7906
7907 if (IsTailCall) {
7908 // Check if it's really possible to do a tail call.
7909 IsTailCall = isEligibleForTailCallOptimization(CLI);
7910
7911 // A sibling call is one where we're under the usual C ABI and not planning
7912 // to change that but can still do a tail call:
7913 if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
7914 CallConv != CallingConv::SwiftTail)
7915 IsSibCall = true;
7916
7917 if (IsTailCall)
7918 ++NumTailCalls;
7919 }
7920
7921 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
7922 report_fatal_error("failed to perform tail call elimination on a call "
7923 "site marked musttail");
7924
7925 // Get a count of how many bytes are to be pushed on the stack.
7926 unsigned NumBytes = CCInfo.getStackSize();
7927
7928 if (IsSibCall) {
7929 // Since we're not changing the ABI to make this a tail call, the memory
7930 // operands are already available in the caller's incoming argument space.
7931 NumBytes = 0;
7932 }
7933
7934 // FPDiff is the byte offset of the call's argument area from the callee's.
7935 // Stores to callee stack arguments will be placed in FixedStackSlots offset
7936 // by this amount for a tail call. In a sibling call it must be 0 because the
7937 // caller will deallocate the entire stack and the callee still expects its
7938 // arguments to begin at SP+0. Completely unused for non-tail calls.
7939 int FPDiff = 0;
7940
7941 if (IsTailCall && !IsSibCall) {
7942 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
7943
7944 // Since callee will pop argument stack as a tail call, we must keep the
7945 // popped size 16-byte aligned.
7946 NumBytes = alignTo(NumBytes, 16);
7947
7948 // FPDiff will be negative if this tail call requires more space than we
7949 // would automatically have in our incoming argument space. Positive if we
7950 // can actually shrink the stack.
7951 FPDiff = NumReusableBytes - NumBytes;
7952
7953 // Update the required reserved area if this is the tail call requiring the
7954 // most argument stack space.
7955 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
7956 FuncInfo->setTailCallReservedStack(-FPDiff);
7957
7958 // The stack pointer must be 16-byte aligned at all times it's used for a
7959 // memory operation, which in practice means at *all* times and in
7960 // particular across call boundaries. Therefore our own arguments started at
7961 // a 16-byte aligned SP and the delta applied for the tail call should
7962 // satisfy the same constraint.
7963 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
7964 }
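// Illustrative example: if the caller's incoming stack-argument area holds 32
// bytes (NumReusableBytes) and this tail call needs 48 bytes of stack
// arguments, then FPDiff is -16 and 16 bytes are recorded via
// setTailCallReservedStack so the prologue reserves the extra space.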
7965
7966 // Determine whether we need any streaming mode changes.
7967 SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction());
7968 if (CLI.CB)
7969 CalleeAttrs = SMEAttrs(*CLI.CB);
7970 else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
7971 CalleeAttrs = SMEAttrs(ES->getSymbol());
7972
7973 auto DescribeCallsite =
7974 [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
7975 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
7976 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
7977 R << ore::NV("Callee", ES->getSymbol());
7978 else if (CLI.CB && CLI.CB->getCalledFunction())
7979 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
7980 else
7981 R << "unknown callee";
7982 R << "'";
7983 return R;
7984 };
7985
7986 bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
7987 if (RequiresLazySave) {
7988 unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj();
7989 MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj);
7990 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj,
7991 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
7992 SDValue NumZaSaveSlicesAddr =
7993 DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
7994 DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));
7995 SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
7996 DAG.getConstant(1, DL, MVT::i32));
7997 Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr,
7998 MPI, MVT::i16);
7999 Chain = DAG.getNode(
8000 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
8001 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8002 TPIDR2ObjAddr);
8003 OptimizationRemarkEmitter ORE(&MF.getFunction());
8004 ORE.emit([&]() {
8005 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
8006 CLI.CB)
8007 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
8008 &MF.getFunction());
8009 return DescribeCallsite(R) << " sets up a lazy save for ZA";
8010 });
8011 }
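// Per the SME lazy-save scheme, the TPIDR2 block begins with an 8-byte pointer
// to the ZA save buffer, followed by a 16-bit count of save slices; that is
// why RDSVL #1 is truncated to i16 and stored at offset 8 above.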
8012
8013 SDValue PStateSM;
8014 bool RequiresSMChange = CallerAttrs.requiresSMChange(CalleeAttrs);
8015 if (RequiresSMChange) {
8016 if (CallerAttrs.hasStreamingInterfaceOrBody())
8017 PStateSM = DAG.getConstant(1, DL, MVT::i64);
8018 else if (CallerAttrs.hasNonStreamingInterface())
8019 PStateSM = DAG.getConstant(0, DL, MVT::i64);
8020 else
8021 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
8022 OptimizationRemarkEmitter ORE(&MF.getFunction());
8023 ORE.emit([&]() {
8024 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
8025 CLI.CB)
8026 : OptimizationRemarkAnalysis("sme", "SMETransition",
8027 &MF.getFunction());
8028 DescribeCallsite(R) << " requires a streaming mode transition";
8029 return R;
8030 });
8031 }
8032
8033 SDValue ZTFrameIdx;
8034 MachineFrameInfo &MFI = MF.getFrameInfo();
8035 bool ShouldPreserveZT0 = CallerAttrs.requiresPreservingZT0(CalleeAttrs);
8036
8037 // If the caller has ZT0 state which will not be preserved by the callee,
8038 // spill ZT0 before the call.
8039 if (ShouldPreserveZT0) {
8040 unsigned ZTObj = MFI.CreateSpillStackObject(64, Align(16));
8041 ZTFrameIdx = DAG.getFrameIndex(
8042 ZTObj,
8043 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8044
8045 Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
8046 {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
8047 }
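// ZT0 is a 512-bit register, hence the 64-byte spill slot created above.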
8048
8049 // If caller shares ZT0 but the callee is not shared ZA, we need to stop
8050 // PSTATE.ZA before the call if there is no lazy-save active.
8051 bool DisableZA = CallerAttrs.requiresDisablingZABeforeCall(CalleeAttrs);
8052 assert((!DisableZA || !RequiresLazySave) &&
8053 "Lazy-save should have PSTATE.SM=1 on entry to the function");
8054
8055 if (DisableZA)
8056 Chain = DAG.getNode(
8057 AArch64ISD::SMSTOP, DL, MVT::Other, Chain,
8058 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
8059 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
8060
8061 // Adjust the stack pointer for the new arguments...
8062 // These operations are automatically eliminated by the prolog/epilog pass
8063 if (!IsSibCall)
8064 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
8065
8066 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
8067 getPointerTy(DAG.getDataLayout()));
8068
8069 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
8070 SmallSet<unsigned, 8> RegsUsed;
8071 SmallVector<SDValue, 8> MemOpChains;
8072 auto PtrVT = getPointerTy(DAG.getDataLayout());
8073
8074 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
8075 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
8076 for (const auto &F : Forwards) {
8077 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
8078 RegsToPass.emplace_back(F.PReg, Val);
8079 }
8080 }
8081
8082 // Walk the register/memloc assignments, inserting copies/loads.
8083 unsigned ExtraArgLocs = 0;
8084 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
8085 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
8086 SDValue Arg = OutVals[i];
8087 ISD::ArgFlagsTy Flags = Outs[i].Flags;
8088
8089 // Promote the value if needed.
8090 switch (VA.getLocInfo()) {
8091 default:
8092 llvm_unreachable("Unknown loc info!");
8093 case CCValAssign::Full:
8094 break;
8095 case CCValAssign::SExt:
8096 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
8097 break;
8098 case CCValAssign::ZExt:
8099 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
8100 break;
8101 case CCValAssign::AExt:
8102 if (Outs[i].ArgVT == MVT::i1) {
8103 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
8104 //
8105 // Check if we actually have to do this, because the value may
8106 // already be zero-extended.
8107 //
8108 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
8109 // and rely on DAGCombiner to fold this, because the following
8110 // (anyext i32) is combined with (zext i8) in DAG.getNode:
8111 //
8112 // (ext (zext x)) -> (zext x)
8113 //
8114 // This will give us (zext i32), which we cannot remove, so
8115 // try to check this beforehand.
8116 if (!checkZExtBool(Arg, DAG)) {
8117 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
8118 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
8119 }
8120 }
8121 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
8122 break;
8123 case CCValAssign::AExtUpper:
8124 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
8125 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
8126 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
8127 DAG.getConstant(32, DL, VA.getLocVT()));
8128 break;
8129 case CCValAssign::BCvt:
8130 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
8131 break;
8132 case CCValAssign::Trunc:
8133 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8134 break;
8135 case CCValAssign::FPExt:
8136 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
8137 break;
8138 case CCValAssign::Indirect: {
8139 bool isScalable = VA.getValVT().isScalableVT();
8140 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
8141 "Indirect arguments should be scalable on most subtargets");
8142
8143 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
8144 uint64_t PartSize = StoreSize;
8145 unsigned NumParts = 1;
8146 if (Outs[i].Flags.isInConsecutiveRegs()) {
8147 assert(!Outs[i].Flags.isInConsecutiveRegsLast());
8148 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8149 ++NumParts;
8150 StoreSize *= NumParts;
8151 }
8152
8153 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
8154 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
8155 MachineFrameInfo &MFI = MF.getFrameInfo();
8156 int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
8157 if (isScalable)
8158 MFI.setStackID(FI, TargetStackID::ScalableVector);
8159
8160 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
8161 SDValue Ptr = DAG.getFrameIndex(
8162 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8163 SDValue SpillSlot = Ptr;
8164
8165 // Ensure we generate all stores for each tuple part, whilst updating the
8166 // pointer after each store correctly using vscale.
8167 while (NumParts) {
8168 SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
8169 MemOpChains.push_back(Store);
8170
8171 NumParts--;
8172 if (NumParts > 0) {
8173 SDValue BytesIncrement;
8174 if (isScalable) {
8175 BytesIncrement = DAG.getVScale(
8176 DL, Ptr.getValueType(),
8177 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
8178 } else {
8179 BytesIncrement = DAG.getConstant(
8180 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
8181 Ptr.getValueType());
8182 }
8183 SDNodeFlags Flags;
8184 Flags.setNoUnsignedWrap(true);
8185
8186 MPI = MachinePointerInfo(MPI.getAddrSpace());
8187 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8188 BytesIncrement, Flags);
8189 ExtraArgLocs++;
8190 i++;
8191 }
8192 }
8193
8194 Arg = SpillSlot;
8195 break;
8196 }
8197
8198 if (VA.isRegLoc()) {
8199 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
8200 Outs[0].VT == MVT::i64) {
8201 assert(VA.getLocVT() == MVT::i64 &&
8202 "unexpected calling convention register assignment");
8203 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
8204 "unexpected use of 'returned'");
8205 IsThisReturn = true;
8206 }
8207 if (RegsUsed.count(VA.getLocReg())) {
8208 // If this register has already been used then we're trying to pack
8209 // parts of an [N x i32] into an X-register. The extension type will
8210 // take care of putting the two halves in the right place but we have to
8211 // combine them.
8212 SDValue &Bits =
8213 llvm::find_if(RegsToPass,
8214 [=](const std::pair<unsigned, SDValue> &Elt) {
8215 return Elt.first == VA.getLocReg();
8216 })
8217 ->second;
8218 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
8219 // Call site info is used for function's parameter entry value
8220 // tracking. For now we track only simple cases when parameter
8221 // is transferred through whole register.
8222 llvm::erase_if(CSInfo, [&VA](MachineFunction::ArgRegPair ArgReg) {
8223 return ArgReg.Reg == VA.getLocReg();
8224 });
8225 } else {
8226 // Add an extra level of indirection for streaming mode changes by
8227 // using a pseudo copy node that cannot be rematerialised between a
8228 // smstart/smstop and the call by the simple register coalescer.
8229 if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
8230 Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8231 Arg.getValueType(), Arg);
8232 RegsToPass.emplace_back(VA.getLocReg(), Arg);
8233 RegsUsed.insert(VA.getLocReg());
8234 const TargetOptions &Options = DAG.getTarget().Options;
8235 if (Options.EmitCallSiteInfo)
8236 CSInfo.emplace_back(VA.getLocReg(), i);
8237 }
8238 } else {
8239 assert(VA.isMemLoc());
8240
8241 SDValue DstAddr;
8242 MachinePointerInfo DstInfo;
8243
8244 // FIXME: This works on big-endian for composite byvals, which are the
8245 // common case. It should also work for fundamental types too.
8246 uint32_t BEAlign = 0;
8247 unsigned OpSize;
8248 if (VA.getLocInfo() == CCValAssign::Indirect ||
8249 VA.getLocInfo() == CCValAssign::Trunc)
8250 OpSize = VA.getLocVT().getFixedSizeInBits();
8251 else
8252 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
8253 : VA.getValVT().getSizeInBits();
8254 OpSize = (OpSize + 7) / 8;
8255 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
8256 !Flags.isInConsecutiveRegs()) {
8257 if (OpSize < 8)
8258 BEAlign = 8 - OpSize;
8259 }
8260 unsigned LocMemOffset = VA.getLocMemOffset();
8261 int32_t Offset = LocMemOffset + BEAlign;
8262 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
8263 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
8264
8265 if (IsTailCall) {
8266 Offset = Offset + FPDiff;
8267 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
8268
8269 DstAddr = DAG.getFrameIndex(FI, PtrVT);
8270 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
8271
8272 // Make sure any stack arguments overlapping with where we're storing
8273 // are loaded before this eventual operation. Otherwise they'll be
8274 // clobbered.
8275 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
8276 } else {
8277 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
8278
8279 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
8280 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
8281 }
8282
8283 if (Outs[i].Flags.isByVal()) {
8284 SDValue SizeNode =
8285 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
8286 SDValue Cpy = DAG.getMemcpy(
8287 Chain, DL, DstAddr, Arg, SizeNode,
8288 Outs[i].Flags.getNonZeroByValAlign(),
8289 /*isVol = */ false, /*AlwaysInline = */ false,
8290 /*isTailCall = */ false, DstInfo, MachinePointerInfo());
8291
8292 MemOpChains.push_back(Cpy);
8293 } else {
8294 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
8295 // promoted to a legal register type i32, we should truncate Arg back to
8296 // i1/i8/i16.
8297 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
8298 VA.getValVT() == MVT::i16)
8299 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
8300
8301 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
8302 MemOpChains.push_back(Store);
8303 }
8304 }
8305 }
8306
8307 if (IsVarArg && Subtarget->isWindowsArm64EC()) {
8308 SDValue ParamPtr = StackPtr;
8309 if (IsTailCall) {
8310 // Create a dummy object at the top of the stack that can be used to get
8311 // the SP after the epilogue
8312 int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
8313 ParamPtr = DAG.getFrameIndex(FI, PtrVT);
8314 }
8315
8316 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
8317 // describing the argument list. x4 contains the address of the
8318 // first stack parameter. x5 contains the size in bytes of all parameters
8319 // passed on the stack.
8320 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
8321 RegsToPass.emplace_back(AArch64::X5,
8322 DAG.getConstant(NumBytes, DL, MVT::i64));
8323 }
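// For example, a variadic Arm64EC call that passes 32 bytes of arguments on
// the stack ends up with x4 holding the address of the first stack slot and
// x5 holding the value 32.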
8324
8325 if (!MemOpChains.empty())
8326 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
8327
8328 SDValue InGlue;
8329 if (RequiresSMChange) {
8330 SDValue NewChain = changeStreamingMode(
8331 DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain, InGlue,
8332 getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
8333 Chain = NewChain.getValue(0);
8334 InGlue = NewChain.getValue(1);
8335 }
8336
8337 // Build a sequence of copy-to-reg nodes chained together with token chain
8338 // and flag operands which copy the outgoing args into the appropriate regs.
8339 for (auto &RegToPass : RegsToPass) {
8340 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
8341 RegToPass.second, InGlue);
8342 InGlue = Chain.getValue(1);
8343 }
8344
8345 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
8346 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
8347 // node so that legalize doesn't hack it.
8348 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
8349 auto GV = G->getGlobal();
8350 unsigned OpFlags =
8351 Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine());
8352 if (OpFlags & AArch64II::MO_GOT) {
8353 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
8354 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
8355 } else {
8356 const GlobalValue *GV = G->getGlobal();
8357 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
8358 }
8359 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
8360 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
8361 Subtarget->isTargetMachO()) ||
8362 MF.getFunction().getParent()->getRtLibUseGOT();
8363 const char *Sym = S->getSymbol();
8364 if (UseGot) {
8365 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
8366 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
8367 } else {
8368 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
8369 }
8370 }
8371
8372 // We don't usually want to end the call-sequence here because we would tidy
8373 // the frame up *after* the call, however in the ABI-changing tail-call case
8374 // we've carefully laid out the parameters so that when sp is reset they'll be
8375 // in the correct location.
8376 if (IsTailCall && !IsSibCall) {
8377 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
8378 InGlue = Chain.getValue(1);
8379 }
8380
8381 std::vector<SDValue> Ops;
8382 Ops.push_back(Chain);
8383 Ops.push_back(Callee);
8384
8385 if (IsTailCall) {
8386 // Each tail call may have to adjust the stack by a different amount, so
8387 // this information must travel along with the operation for eventual
8388 // consumption by emitEpilogue.
8389 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
8390 }
8391
8392 // Add argument registers to the end of the list so that they are known live
8393 // into the call.
8394 for (auto &RegToPass : RegsToPass)
8395 Ops.push_back(DAG.getRegister(RegToPass.first,
8396 RegToPass.second.getValueType()));
8397
8398 // Add a register mask operand representing the call-preserved registers.
8399 const uint32_t *Mask;
8400 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8401 if (IsThisReturn) {
8402 // For 'this' returns, use the X0-preserving mask if applicable
8403 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
8404 if (!Mask) {
8405 IsThisReturn = false;
8406 Mask = TRI->getCallPreservedMask(MF, CallConv);
8407 }
8408 } else
8409 Mask = TRI->getCallPreservedMask(MF, CallConv);
8410
8411 if (Subtarget->hasCustomCallingConv())
8412 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
8413
8414 if (TRI->isAnyArgRegReserved(MF))
8415 TRI->emitReservedArgRegCallError(MF);
8416
8417 assert(Mask && "Missing call preserved mask for calling convention");
8418 Ops.push_back(DAG.getRegisterMask(Mask));
8419
8420 if (InGlue.getNode())
8421 Ops.push_back(InGlue);
8422
8423 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
8424
8425 // If we're doing a tail call, use a TC_RETURN here rather than an
8426 // actual call instruction.
8427 if (IsTailCall) {
8428 MF.getFrameInfo().setHasTailCall();
8429 SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
8430
8431 if (IsCFICall)
8432 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
8433
8434 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
8435 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
8436 return Ret;
8437 }
8438
8439 unsigned CallOpc = AArch64ISD::CALL;
8440 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
8441 // be expanded to the call, directly followed by a special marker sequence and
8442 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
8443 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
8444 assert(!IsTailCall &&
8445 "tail calls cannot be marked with clang.arc.attachedcall");
8446 CallOpc = AArch64ISD::CALL_RVMARKER;
8447
8448 // Add a target global address for the retainRV/claimRV runtime function
8449 // just before the call target.
8450 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
8451 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
8452 Ops.insert(Ops.begin() + 1, GA);
8453 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
8454 CallOpc = AArch64ISD::CALL_ARM64EC_TO_X64;
8455 } else if (GuardWithBTI) {
8456 CallOpc = AArch64ISD::CALL_BTI;
8457 }
8458
8459 // Returns a chain and a flag for retval copy to use.
8460 Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops);
8461
8462 if (IsCFICall)
8463 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
8464
8465 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
8466 InGlue = Chain.getValue(1);
8467 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
8468
8469 uint64_t CalleePopBytes =
8470 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
8471
8472 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
8473 InGlue = Chain.getValue(1);
8474
8475 // Handle result values, copying them out of physregs into vregs that we
8476 // return.
8477 SDValue Result = LowerCallResult(
8478 Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
8479 IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
8480
8481 if (!Ins.empty())
8482 InGlue = Result.getValue(Result->getNumValues() - 1);
8483
8484 if (RequiresSMChange) {
8485 assert(PStateSM && "Expected a PStateSM to be set");
8486 Result = changeStreamingMode(
8487 DAG, DL, !CalleeAttrs.hasStreamingInterface(), Result, InGlue,
8488 getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
8489 }
8490
8491 if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs))
8492 // Unconditionally resume ZA.
8493 Result = DAG.getNode(
8494 AArch64ISD::SMSTART, DL, MVT::Other, Result,
8495 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
8496 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
8497
8498 if (ShouldPreserveZT0)
8499 Result =
8500 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
8501 {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
8502
8503 if (RequiresLazySave) {
8504 // Conditionally restore the lazy save using a pseudo node.
8505 unsigned FI = FuncInfo->getLazySaveTPIDR2Obj();
8506 SDValue RegMask = DAG.getRegisterMask(
8507 TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
8508 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
8509 "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout()));
8510 SDValue TPIDR2_EL0 = DAG.getNode(
8511 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
8512 DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
8513
8514 // Copy the address of the TPIDR2 block into X0 before 'calling' the
8515 // RESTORE_ZA pseudo.
8516 SDValue Glue;
8517 SDValue TPIDR2Block = DAG.getFrameIndex(
8518 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8519 Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
8520 Result =
8521 DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
8522 {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
8523 RestoreRoutine, RegMask, Result.getValue(1)});
8524
8525 // Finally reset the TPIDR2_EL0 register to 0.
8526 Result = DAG.getNode(
8527 ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
8528 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8529 DAG.getConstant(0, DL, MVT::i64));
8530 }
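// Roughly, the restore is conditional because under the lazy-save protocol
// the callee only commits the save (and clears TPIDR2_EL0) if it actually
// uses ZA; if TPIDR2_EL0 still points at our block after the call, ZA is
// untouched and __arm_tpidr2_restore does not need to run.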
8531
8532 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) {
8533 for (unsigned I = 0; I < InVals.size(); ++I) {
8534 // The smstart/smstop is chained as part of the call, but when the
8535 // resulting chain is discarded (which happens when the call is not part
8536 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
8537 // smstart/smstop is chained to the result value. We can do that by doing
8538 // a vreg -> vreg copy.
8539 Register Reg = MF.getRegInfo().createVirtualRegister(
8540 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
8541 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
8542 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
8543 InVals[I].getValueType());
8544 }
8545 }
8546
8547 return Result;
8548}
8549
8550bool AArch64TargetLowering::CanLowerReturn(
8551 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
8552 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
8553 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
8554 SmallVector<CCValAssign, 16> RVLocs;
8555 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
8556 return CCInfo.CheckReturn(Outs, RetCC);
8557}
8558
8559SDValue
8560AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
8561 bool isVarArg,
8562 const SmallVectorImpl<ISD::OutputArg> &Outs,
8563 const SmallVectorImpl<SDValue> &OutVals,
8564 const SDLoc &DL, SelectionDAG &DAG) const {
8565 auto &MF = DAG.getMachineFunction();
8566 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8567
8568 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
8569 SmallVector<CCValAssign, 16> RVLocs;
8570 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
8571 CCInfo.AnalyzeReturn(Outs, RetCC);
8572
8573 // Copy the result values into the output registers.
8574 SDValue Glue;
8575 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
8576 SmallSet<unsigned, 4> RegsUsed;
8577 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
8578 ++i, ++realRVLocIdx) {
8579 CCValAssign &VA = RVLocs[i];
8580 assert(VA.isRegLoc() && "Can only return in registers!");
8581 SDValue Arg = OutVals[realRVLocIdx];
8582
8583 switch (VA.getLocInfo()) {
8584 default:
8585 llvm_unreachable("Unknown loc info!");
8586 case CCValAssign::Full:
8587 if (Outs[i].ArgVT == MVT::i1) {
8588 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
8589 // value. This is strictly redundant on Darwin (which uses "zeroext
8590 // i1"), but will be optimised out before ISel.
8591 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
8592 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
8593 }
8594 break;
8595 case CCValAssign::BCvt:
8596 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
8597 break;
8598 case CCValAssign::AExt:
8599 case CCValAssign::ZExt:
8600 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8601 break;
8602 case CCValAssign::AExtUpper:
8603 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
8604 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8605 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
8606 DAG.getConstant(32, DL, VA.getLocVT()));
8607 break;
8608 }
8609
8610 if (RegsUsed.count(VA.getLocReg())) {
8611 SDValue &Bits =
8612 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
8613 return Elt.first == VA.getLocReg();
8614 })->second;
8615 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
8616 } else {
8617 RetVals.emplace_back(VA.getLocReg(), Arg);
8618 RegsUsed.insert(VA.getLocReg());
8619 }
8620 }
8621
8622 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8623
8624 // Emit SMSTOP before returning from a locally streaming function
8625 SMEAttrs FuncAttrs(MF.getFunction());
8626 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
8627 if (FuncAttrs.hasStreamingCompatibleInterface()) {
8628 Register Reg = FuncInfo->getPStateSMReg();
8629 assert(Reg.isValid() && "PStateSM Register is invalid");
8630 SDValue PStateSM = DAG.getCopyFromReg(Chain, DL, Reg, MVT::i64);
8631 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
8632 /*Glue*/ SDValue(),
8633 AArch64SME::IfCallerIsStreaming, PStateSM);
8634 } else
8635 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
8636 /*Glue*/ SDValue(), AArch64SME::Always);
8637 Glue = Chain.getValue(1);
8638 }
8639
8640 SmallVector<SDValue, 4> RetOps(1, Chain);
8641 for (auto &RetVal : RetVals) {
8642 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
8643 isPassedInFPR(RetVal.second.getValueType()))
8644 RetVal.second = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8645 RetVal.second.getValueType(), RetVal.second);
8646 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
8647 Glue = Chain.getValue(1);
8648 RetOps.push_back(
8649 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
8650 }
8651
8652 // Windows AArch64 ABIs require that for returning structs by value we copy
8653 // the sret argument into X0 for the return.
8654 // We saved the argument into a virtual register in the entry block,
8655 // so now we copy the value out and into X0.
8656 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
8657 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
8658 getPointerTy(MF.getDataLayout()));
8659
8660 unsigned RetValReg = AArch64::X0;
8661 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8662 RetValReg = AArch64::X8;
8663 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
8664 Glue = Chain.getValue(1);
8665
8666 RetOps.push_back(
8667 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
8668 }
8669
8670 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
8671 if (I) {
8672 for (; *I; ++I) {
8673 if (AArch64::GPR64RegClass.contains(*I))
8674 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
8675 else if (AArch64::FPR64RegClass.contains(*I))
8676 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
8677 else
8678 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
8679 }
8680 }
8681
8682 RetOps[0] = Chain; // Update chain.
8683
8684 // Add the glue if we have it.
8685 if (Glue.getNode())
8686 RetOps.push_back(Glue);
8687
8688 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
8689 // ARM64EC entry thunks use a special return sequence: instead of a regular
8690 // "ret" instruction, they need to explicitly call the emulator.
8691 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8692 SDValue Arm64ECRetDest =
8693 DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
8694 Arm64ECRetDest =
8695 getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
8696 Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
8697 MachinePointerInfo());
8698 RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
8699 RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
8700 return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
8701 }
8702
8703 return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
8704}
8705
8706//===----------------------------------------------------------------------===//
8707// Other Lowering Code
8708//===----------------------------------------------------------------------===//
8709
8710SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
8711 SelectionDAG &DAG,
8712 unsigned Flag) const {
8713 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
8714 N->getOffset(), Flag);
8715}
8716
8717SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
8718 SelectionDAG &DAG,
8719 unsigned Flag) const {
8720 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
8721}
8722
8723SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
8724 SelectionDAG &DAG,
8725 unsigned Flag) const {
8726 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
8727 N->getOffset(), Flag);
8728}
8729
8730SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
8731 SelectionDAG &DAG,
8732 unsigned Flag) const {
8733 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
8734}
8735
8736SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
8737 SelectionDAG &DAG,
8738 unsigned Flag) const {
8739 return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
8740}
8741
8742// (loadGOT sym)
8743template <class NodeTy>
8744SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
8745 unsigned Flags) const {
8746 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
8747 SDLoc DL(N);
8748 EVT Ty = getPointerTy(DAG.getDataLayout());
8749 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
8750 // FIXME: Once remat is capable of dealing with instructions with register
8751 // operands, expand this into two nodes instead of using a wrapper node.
8752 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
8753}
8754
8755// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
8756template <class NodeTy>
8757SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
8758 unsigned Flags) const {
8759 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
8760 SDLoc DL(N);
8761 EVT Ty = getPointerTy(DAG.getDataLayout());
8762 const unsigned char MO_NC = AArch64II::MO_NC;
8763 return DAG.getNode(
8764 AArch64ISD::WrapperLarge, DL, Ty,
8765 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
8766 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
8767 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
8768 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
8769}
8770
8771// (addlow (adrp %hi(sym)) %lo(sym))
8772template <class NodeTy>
8773SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
8774 unsigned Flags) const {
8775 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
8776 SDLoc DL(N);
8777 EVT Ty = getPointerTy(DAG.getDataLayout());
8778 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
8779 SDValue Lo = getTargetNode(N, Ty, DAG,
8780 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
8781 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
8782 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
8783}
8784
8785// (adr sym)
8786template <class NodeTy>
8787SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
8788 unsigned Flags) const {
8789 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
8790 SDLoc DL(N);
8791 EVT Ty = getPointerTy(DAG.getDataLayout());
8792 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
8793 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
8794}
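// Taken together, these helpers cover the code-model-dependent addressing
// strategies: getGOT() loads the address from the GOT, getAddrLarge() builds
// a full 64-bit address with movz/movk (large code model), getAddr() uses the
// usual adrp+add pair (small code model), and getAddrTiny() uses a single adr
// (tiny code model).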
8795
8796SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
8797 SelectionDAG &DAG) const {
8798 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
8799 const GlobalValue *GV = GN->getGlobal();
8800 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
8801
8802 if (OpFlags != AArch64II::MO_NO_FLAG)
8803 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
8804 "unexpected offset in global node");
8805
8806 // This also catches the large code model case for Darwin, and tiny code
8807 // model with got relocations.
8808 if ((OpFlags & AArch64II::MO_GOT) != 0) {
8809 return getGOT(GN, DAG, OpFlags);
8810 }
8811
8812 SDValue Result;
8813 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
8814 !getTargetMachine().isPositionIndependent()) {
8815 Result = getAddrLarge(GN, DAG, OpFlags);
8816 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
8817 Result = getAddrTiny(GN, DAG, OpFlags);
8818 } else {
8819 Result = getAddr(GN, DAG, OpFlags);
8820 }
8821 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8822 SDLoc DL(GN);
8823 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
8824 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
8825 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
8826 return Result;
8827}
8828
8829/// Convert a TLS address reference into the correct sequence of loads
8830/// and calls to compute the variable's address (for Darwin, currently) and
8831/// return an SDValue containing the final node.
8832
8833/// Darwin only has one TLS scheme which must be capable of dealing with the
8834/// fully general situation, in the worst case. This means:
8835/// + "extern __thread" declaration.
8836/// + Defined in a possibly unknown dynamic library.
8837///
8838/// The general system is that each __thread variable has a [3 x i64] descriptor
8839/// which contains information used by the runtime to calculate the address. The
8840/// only part of this the compiler needs to know about is the first xword, which
8841/// contains a function pointer that must be called with the address of the
8842/// entire descriptor in "x0".
8843///
8844/// Since this descriptor may be in a different unit, in general even the
8845/// descriptor must be accessed via an indirect load. The "ideal" code sequence
8846/// is:
8847/// adrp x0, _var@TLVPPAGE
8848/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
8849/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
8850/// ; the function pointer
8851/// blr x1 ; Uses descriptor address in x0
8852/// ; Address of _var is now in x0.
8853///
8854/// If the address of _var's descriptor *is* known to the linker, then it can
8855/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
8856/// a slight efficiency gain.
8857SDValue
8858AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
8859 SelectionDAG &DAG) const {
8860 assert(Subtarget->isTargetDarwin() &&
8861 "This function expects a Darwin target");
8862
8863 SDLoc DL(Op);
8864 MVT PtrVT = getPointerTy(DAG.getDataLayout());
8865 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
8866 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
8867
8868 SDValue TLVPAddr =
8869 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
8870 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
8871
8872 // The first entry in the descriptor is a function pointer that we must call
8873 // to obtain the address of the variable.
8874 SDValue Chain = DAG.getEntryNode();
8875 SDValue FuncTLVGet = DAG.getLoad(
8876 PtrMemVT, DL, Chain, DescAddr,
8877 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
8878 Align(PtrMemVT.getSizeInBits() / 8),
8879 MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
8880 Chain = FuncTLVGet.getValue(1);
8881
8882 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
8883 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
8884
8885 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
8886 MFI.setAdjustsStack(true);
8887
8888 // TLS calls preserve all registers except those that absolutely must be
8889 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
8890 // silly).
8891 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8892 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
8893 if (Subtarget->hasCustomCallingConv())
8894 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
8895
8896 // Finally, we can make the call. This is just a degenerate version of a
8897 // normal AArch64 call node: x0 takes the address of the descriptor, and
8898 // returns the address of the variable in this thread.
8899 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
8900 Chain =
8901 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
8902 Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
8903 DAG.getRegisterMask(Mask), Chain.getValue(1));
8904 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
8905}
8906
8907/// Convert a thread-local variable reference into a sequence of instructions to
8908/// compute the variable's address for the local exec TLS model of ELF targets.
8909/// The sequence depends on the maximum TLS area size.
8910SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
8911 SDValue ThreadBase,
8912 const SDLoc &DL,
8913 SelectionDAG &DAG) const {
8914 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8915 SDValue TPOff, Addr;
8916
8917 switch (DAG.getTarget().Options.TLSSize) {
8918 default:
8919 llvm_unreachable("Unexpected TLS size");
8920
8921 case 12: {
8922 // mrs x0, TPIDR_EL0
8923 // add x0, x0, :tprel_lo12:a
8924 SDValue Var = DAG.getTargetGlobalAddress(
8925 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
8926 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
8927 Var,
8928 DAG.getTargetConstant(0, DL, MVT::i32)),
8929 0);
8930 }
8931
8932 case 24: {
8933 // mrs x0, TPIDR_EL0
8934 // add x0, x0, :tprel_hi12:a
8935 // add x0, x0, :tprel_lo12_nc:a
8936 SDValue HiVar = DAG.getTargetGlobalAddress(
8937 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
8938 SDValue LoVar = DAG.getTargetGlobalAddress(
8939 GV, DL, PtrVT, 0,
8940 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
8941 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
8942 HiVar,
8943 DAG.getTargetConstant(0, DL, MVT::i32)),
8944 0);
8945 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
8946 LoVar,
8947 DAG.getTargetConstant(0, DL, MVT::i32)),
8948 0);
8949 }
8950
8951 case 32: {
8952 // mrs x1, TPIDR_EL0
8953 // movz x0, #:tprel_g1:a
8954 // movk x0, #:tprel_g0_nc:a
8955 // add x0, x1, x0
8956 SDValue HiVar = DAG.getTargetGlobalAddress(
8957 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
8958 SDValue LoVar = DAG.getTargetGlobalAddress(
8959 GV, DL, PtrVT, 0,
8960 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
8961 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
8962 DAG.getTargetConstant(16, DL, MVT::i32)),
8963 0);
8964 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
8965 DAG.getTargetConstant(0, DL, MVT::i32)),
8966 0);
8967 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
8968 }
8969
8970 case 48: {
8971 // mrs x1, TPIDR_EL0
8972 // movz x0, #:tprel_g2:a
8973 // movk x0, #:tprel_g1_nc:a
8974 // movk x0, #:tprel_g0_nc:a
8975 // add x0, x1, x0
8976 SDValue HiVar = DAG.getTargetGlobalAddress(
8977 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
8978 SDValue MiVar = DAG.getTargetGlobalAddress(
8979 GV, DL, PtrVT, 0,
8980 AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
8981 SDValue LoVar = DAG.getTargetGlobalAddress(
8982 GV, DL, PtrVT, 0,
8983 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
8984 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
8985 DAG.getTargetConstant(32, DL, MVT::i32)),
8986 0);
8987 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
8988 DAG.getTargetConstant(16, DL, MVT::i32)),
8989 0);
8990 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
8991 DAG.getTargetConstant(0, DL, MVT::i32)),
8992 0);
8993 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
8994 }
8995 }
8996}
8997
8998/// When accessing thread-local variables under either the general-dynamic or
8999/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
9000/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
9001/// is a function pointer to carry out the resolution.
9002///
9003/// The sequence is:
9004/// adrp x0, :tlsdesc:var
9005/// ldr x1, [x0, #:tlsdesc_lo12:var]
9006/// add x0, x0, #:tlsdesc_lo12:var
9007/// .tlsdesccall var
9008/// blr x1
9009/// (TPIDR_EL0 offset now in x0)
9010///
9011/// The above sequence must be produced unscheduled, to enable the linker to
9012/// optimize/relax this sequence.
9013/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
9014/// above sequence, and expanded really late in the compilation flow, to ensure
9015/// the sequence is produced as per above.
9016SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
9017 const SDLoc &DL,
9018 SelectionDAG &DAG) const {
9019 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9020
9021 SDValue Chain = DAG.getEntryNode();
9022 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
9023
9024 Chain =
9025 DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
9026 SDValue Glue = Chain.getValue(1);
9027
9028 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
9029}
9030
9031SDValue
9032AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
9033 SelectionDAG &DAG) const {
9034 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
9035
9036 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9037 
9038 TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
9039 
9040 if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
9041 if (Model == TLSModel::LocalDynamic)
9042 Model = TLSModel::GeneralDynamic;
9043 }
9044 
9045 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
9046 Model != TLSModel::LocalExec)
9047 report_fatal_error("ELF TLS only supported in small memory model or "
9048 "in local exec TLS model");
9049 // Different choices can be made for the maximum size of the TLS area for a
9050 // module. For the small address model, the default TLS size is 16MiB and the
9051 // maximum TLS size is 4GiB.
9052 // FIXME: add tiny and large code model support for TLS access models other
9053 // than local exec. We currently generate the same code as small for tiny,
9054 // which may be larger than needed.
9055
9056 SDValue TPOff;
9057 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9058 SDLoc DL(Op);
9059 const GlobalValue *GV = GA->getGlobal();
9060
9061 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
9062
9063 if (Model == TLSModel::LocalExec) {
9064 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
9065 } else if (Model == TLSModel::InitialExec) {
9066 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
9067 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
9068 } else if (Model == TLSModel::LocalDynamic) {
9069 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
9070 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
9071 // the beginning of the module's TLS region, followed by a DTPREL offset
9072 // calculation.
9073
9074 // These accesses will need deduplicating if there's more than one.
9075 AArch64FunctionInfo *MFI =
9076 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
9077 MFI->incNumLocalDynamicTLSAccesses();
9078 
9079 // The call needs a relocation too for linker relaxation. It doesn't make
9080 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
9081 // the address.
9082 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
9083 AArch64II::MO_TLS);
9084
9085 // Now we can calculate the offset from TPIDR_EL0 to this module's
9086 // thread-local area.
9087 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
9088
9089 // Now use :dtprel_whatever: operations to calculate this variable's offset
9090 // in its thread-storage area.
9091 SDValue HiVar = DAG.getTargetGlobalAddress(
9092 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
9093 SDValue LoVar = DAG.getTargetGlobalAddress(
9094 GV, DL, MVT::i64, 0,
9095 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9096
9097 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
9098 DAG.getTargetConstant(0, DL, MVT::i32)),
9099 0);
9100 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
9101 DAG.getTargetConstant(0, DL, MVT::i32)),
9102 0);
9103 } else if (Model == TLSModel::GeneralDynamic) {
9104 // The call needs a relocation too for linker relaxation. It doesn't make
9105 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
9106 // the address.
9107 SDValue SymAddr =
9108 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
9109
9110 // Finally we can make a call to calculate the offset from tpidr_el0.
9111 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
9112 } else
9113 llvm_unreachable("Unsupported ELF TLS access model");
9114
9115 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
9116}
9117
9118SDValue
9119AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
9120 SelectionDAG &DAG) const {
9121 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
9122
9123 SDValue Chain = DAG.getEntryNode();
9124 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9125 SDLoc DL(Op);
9126
9127 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
9128
9129 // Load the ThreadLocalStoragePointer from the TEB
9130 // A pointer to the TLS array is located at offset 0x58 from the TEB.
9131 SDValue TLSArray =
9132 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
9133 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
9134 Chain = TLSArray.getValue(1);
9135
9136 // Load the TLS index from the C runtime;
9137 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
9138 // This also does the same as LOADgot, but using a generic i32 load,
9139 // while LOADgot only loads i64.
9140 SDValue TLSIndexHi =
9141 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
9142 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
9143 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9144 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
9145 SDValue TLSIndex =
9146 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
9147 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
9148 Chain = TLSIndex.getValue(1);
9149
9150 // The pointer to the thread's TLS data area for this module is found at an
9151 // offset of the TLS index (scaled by 8) into the TLSArray.
9152 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
9153 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
9154 DAG.getConstant(3, DL, PtrVT));
9155 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
9156 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
9157 MachinePointerInfo());
9158 Chain = TLS.getValue(1);
9159
9160 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9161 const GlobalValue *GV = GA->getGlobal();
9162 SDValue TGAHi = DAG.getTargetGlobalAddress(
9163 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
9164 SDValue TGALo = DAG.getTargetGlobalAddress(
9165 GV, DL, PtrVT, 0,
9166 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9167
9168 // Add the offset from the start of the .tls section (section base).
9169 SDValue Addr =
9170 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
9171 DAG.getTargetConstant(0, DL, MVT::i32)),
9172 0);
9173 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
9174 return Addr;
9175}
9176
9177SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
9178 SelectionDAG &DAG) const {
9179 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9180 if (DAG.getTarget().useEmulatedTLS())
9181 return LowerToTLSEmulatedModel(GA, DAG);
9182
9183 if (Subtarget->isTargetDarwin())
9184 return LowerDarwinGlobalTLSAddress(Op, DAG);
9185 if (Subtarget->isTargetELF())
9186 return LowerELFGlobalTLSAddress(Op, DAG);
9187 if (Subtarget->isTargetWindows())
9188 return LowerWindowsGlobalTLSAddress(Op, DAG);
9189
9190 llvm_unreachable("Unexpected platform trying to use TLS");
9191}
9192
9193// Looks through \param Val to determine the bit that can be used to
9194// check the sign of the value. It returns the unextended value and
9195// the sign bit position.
9196std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
9197 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
9198 return {Val.getOperand(0),
9199 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
9200 1};
9201
9202 if (Val.getOpcode() == ISD::SIGN_EXTEND)
9203 return {Val.getOperand(0),
9204 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
9205
9206 return {Val, Val.getValueSizeInBits() - 1};
9207}
9208
9209SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
9210 SDValue Chain = Op.getOperand(0);
9211 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
9212 SDValue LHS = Op.getOperand(2);
9213 SDValue RHS = Op.getOperand(3);
9214 SDValue Dest = Op.getOperand(4);
9215 SDLoc dl(Op);
9216 
9217 MachineFunction &MF = DAG.getMachineFunction();
9218 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
9219 // will not be produced, as they are conditional branch instructions that do
9220 // not set flags.
9221 bool ProduceNonFlagSettingCondBr =
9222 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
9223
9224 // Handle f128 first, since lowering it will result in comparing the return
9225 // value of a libcall against zero, which is just what the rest of LowerBR_CC
9226 // is expecting to deal with.
9227 if (LHS.getValueType() == MVT::f128) {
9228 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
9229
9230 // If softenSetCCOperands returned a scalar, we need to compare the result
9231 // against zero to select between true and false values.
9232 if (!RHS.getNode()) {
9233 RHS = DAG.getConstant(0, dl, LHS.getValueType());
9234 CC = ISD::SETNE;
9235 }
9236 }
9237
9238 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
9239 // instruction.
9240 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
9241 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
9242 // Only lower legal XALUO ops.
9243 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
9244 return SDValue();
9245
9246 // The actual operation with overflow check.
9247 AArch64CC::CondCode OFCC;
9248 SDValue Value, Overflow;
9249 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
9250
9251 if (CC == ISD::SETNE)
9252 OFCC = getInvertedCondCode(OFCC);
9253 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
9254
9255 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
9256 Overflow);
9257 }
9258
9259 if (LHS.getValueType().isInteger()) {
9260 assert((LHS.getValueType() == RHS.getValueType()) &&
9261 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
9262
9263 // If the RHS of the comparison is zero, we can potentially fold this
9264 // to a specialized branch.
9265 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
9266 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
9267 if (CC == ISD::SETEQ) {
9268 // See if we can use a TBZ to fold in an AND as well.
9269 // TBZ has a smaller branch displacement than CBZ. If the offset is
9270 // out of bounds, a late MI-layer pass rewrites branches.
9271 // 403.gcc is an example that hits this case.
9272 if (LHS.getOpcode() == ISD::AND &&
9273 isa<ConstantSDNode>(LHS.getOperand(1)) &&
9274 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
9275 SDValue Test = LHS.getOperand(0);
9276 uint64_t Mask = LHS.getConstantOperandVal(1);
9277 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
9278 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
9279 Dest);
9280 }
9281
9282 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
9283 } else if (CC == ISD::SETNE) {
9284 // See if we can use a TBZ to fold in an AND as well.
9285 // TBZ has a smaller branch displacement than CBZ. If the offset is
9286 // out of bounds, a late MI-layer pass rewrites branches.
9287 // 403.gcc is an example that hits this case.
9288 if (LHS.getOpcode() == ISD::AND &&
9289 isa<ConstantSDNode>(LHS.getOperand(1)) &&
9290 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
9291 SDValue Test = LHS.getOperand(0);
9292 uint64_t Mask = LHS.getConstantOperandVal(1);
9293 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
9294 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
9295 Dest);
9296 }
9297
9298 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
9299 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
9300 // Don't combine AND since emitComparison converts the AND to an ANDS
9301 // (a.k.a. TST) and the test in the test bit and branch instruction
9302 // becomes redundant. This would also increase register pressure.
9303 uint64_t SignBitPos;
9304 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
9305 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
9306 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
9307 }
9308 }
9309 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
9310 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
9311 // Don't combine AND since emitComparison converts the AND to an ANDS
9312 // (a.k.a. TST) and the test in the test bit and branch instruction
9313 // becomes redundant. This would also increase register pressure.
9314 uint64_t SignBitPos;
9315 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
9316 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
9317 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
9318 }
9319
9320 SDValue CCVal;
9321 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
9322 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
9323 Cmp);
9324 }
9325
9326 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
9327 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
9328
9329 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
9330 // clean. Some of them require two branches to implement.
9331 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9332 AArch64CC::CondCode CC1, CC2;
9333 changeFPCCToAArch64CC(CC, CC1, CC2);
9334 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9335 SDValue BR1 =
9336 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
9337 if (CC2 != AArch64CC::AL) {
9338 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9339 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
9340 Cmp);
9341 }
9342
9343 return BR1;
9344}
9345
9346SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
9347 SelectionDAG &DAG) const {
9348 if (!Subtarget->hasNEON())
9349 return SDValue();
9350
9351 EVT VT = Op.getValueType();
9352 EVT IntVT = VT.changeTypeToInteger();
9353 SDLoc DL(Op);
9354
9355 SDValue In1 = Op.getOperand(0);
9356 SDValue In2 = Op.getOperand(1);
9357 EVT SrcVT = In2.getValueType();
9358
9359 if (!SrcVT.bitsEq(VT))
9360 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
9361
9362 if (VT.isScalableVector())
9363 IntVT =
9364 getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
9365
9366 if (VT.isFixedLengthVector() &&
9367 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
9368 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
9369
9370 In1 = convertToScalableVector(DAG, ContainerVT, In1);
9371 In2 = convertToScalableVector(DAG, ContainerVT, In2);
9372
9373 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
9374 return convertFromScalableVector(DAG, VT, Res);
9375 }
9376
9377 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
9378 if (VT.isScalableVector())
9379 return getSVESafeBitCast(VT, Op, DAG);
9380
9381 return DAG.getBitcast(VT, Op);
9382 };
9383
9384 SDValue VecVal1, VecVal2;
9385 EVT VecVT;
9386 auto SetVecVal = [&](int Idx = -1) {
9387 if (!VT.isVector()) {
9388 VecVal1 =
9389 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
9390 VecVal2 =
9391 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
9392 } else {
9393 VecVal1 = BitCast(VecVT, In1, DAG);
9394 VecVal2 = BitCast(VecVT, In2, DAG);
9395 }
9396 };
9397 if (VT.isVector()) {
9398 VecVT = IntVT;
9399 SetVecVal();
9400 } else if (VT == MVT::f64) {
9401 VecVT = MVT::v2i64;
9402 SetVecVal(AArch64::dsub);
9403 } else if (VT == MVT::f32) {
9404 VecVT = MVT::v4i32;
9405 SetVecVal(AArch64::ssub);
9406 } else if (VT == MVT::f16 || VT == MVT::bf16) {
9407 VecVT = MVT::v8i16;
9408 SetVecVal(AArch64::hsub);
9409 } else {
9410 llvm_unreachable("Invalid type for copysign!");
9411 }
9412
9413 unsigned BitWidth = In1.getScalarValueSizeInBits();
9414 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
9415
9416 // We want to materialize a mask with every bit but the high bit set, but the
9417 // AdvSIMD immediate moves cannot materialize that in a single instruction for
9418 // 64-bit elements. Instead, materialize all bits set and then negate that.
9419 if (VT == MVT::f64 || VT == MVT::v2f64) {
9420 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
9421 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
9422 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
9423 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
9424 }
9425
9426 SDValue BSP =
9427 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
9428 if (VT == MVT::f16 || VT == MVT::bf16)
9429 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
9430 if (VT == MVT::f32)
9431 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
9432 if (VT == MVT::f64)
9433 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
9434
9435 return BitCast(VT, BSP, DAG);
9436}
9437
9438SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
9439 SelectionDAG &DAG) const {
9440 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
9441 Attribute::NoImplicitFloat))
9442 return SDValue();
9443
9444 if (!Subtarget->hasNEON())
9445 return SDValue();
9446
9447 bool IsParity = Op.getOpcode() == ISD::PARITY;
9448 SDValue Val = Op.getOperand(0);
9449 SDLoc DL(Op);
9450 EVT VT = Op.getValueType();
9451
9452 // For i32, computing parity with a general EOR sequence is more efficient
9453 // than going through the floating-point/AdvSIMD path below.
9454 if (VT == MVT::i32 && IsParity)
9455 return SDValue();
9456
9457 // If there is no CNT instruction available, GPR popcount can
9458 // be more efficiently lowered to the following sequence that uses
9459 // AdvSIMD registers/instructions as long as the copies to/from
9460 // the AdvSIMD registers are cheap.
9461 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
9462 // CNT V0.8B, V0.8B // 8xbyte pop-counts
9463 // ADDV B0, V0.8B // sum 8xbyte pop-counts
9464 // UMOV X0, V0.B[0] // copy byte result back to integer reg
9465 if (VT == MVT::i32 || VT == MVT::i64) {
9466 if (VT == MVT::i32)
9467 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
9468 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
9469
9470 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
9471 SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
9472 UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
9473 DAG.getConstant(0, DL, MVT::i64));
9474
9475 if (IsParity)
9476 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
9477 DAG.getConstant(1, DL, MVT::i32));
9478
9479 if (VT == MVT::i64)
9480 UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
9481 return UaddLV;
9482 } else if (VT == MVT::i128) {
9483 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
9484
9485 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
9486 SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
9487 UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
9488 DAG.getConstant(0, DL, MVT::i64));
9489
9490 if (IsParity)
9491 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
9492 DAG.getConstant(1, DL, MVT::i32));
9493
9494 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
9495 }
9496
9497 assert(!IsParity && "ISD::PARITY of vector types not supported");
9498
9499 if (VT.isScalableVector() ||
9501 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
9502
9503 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
9504 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
9505 "Unexpected type for custom ctpop lowering");
9506
9507 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
9508 Val = DAG.getBitcast(VT8Bit, Val);
9509 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
9510
9511 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
9512 unsigned EltSize = 8;
9513 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
9514 while (EltSize != VT.getScalarSizeInBits()) {
9515 EltSize *= 2;
9516 NumElts /= 2;
9517 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
9518 Val = DAG.getNode(
9519 ISD::INTRINSIC_WO_CHAIN, DL, WidenVT,
9520 DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
9521 }
9522
9523 return Val;
9524}
9525
9526SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
9527 EVT VT = Op.getValueType();
9528 assert(VT.isScalableVector() ||
9529 useSVEForFixedLengthVectorVT(
9530 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
9531
9532 SDLoc DL(Op);
9533 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
9534 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
9535}
9536
9537SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
9538 SelectionDAG &DAG) const {
9539
9540 EVT VT = Op.getValueType();
9541 SDLoc DL(Op);
9542 unsigned Opcode = Op.getOpcode();
9543 ISD::CondCode CC;
9544 switch (Opcode) {
9545 default:
9546 llvm_unreachable("Wrong instruction");
9547 case ISD::SMAX:
9548 CC = ISD::SETGT;
9549 break;
9550 case ISD::SMIN:
9551 CC = ISD::SETLT;
9552 break;
9553 case ISD::UMAX:
9554 CC = ISD::SETUGT;
9555 break;
9556 case ISD::UMIN:
9557 CC = ISD::SETULT;
9558 break;
9559 }
9560
9561 if (VT.isScalableVector() ||
9562 useSVEForFixedLengthVectorVT(
9563 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
9564 switch (Opcode) {
9565 default:
9566 llvm_unreachable("Wrong instruction");
9567 case ISD::SMAX:
9568 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
9569 case ISD::SMIN:
9570 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
9571 case ISD::UMAX:
9572 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
9573 case ISD::UMIN:
9574 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
9575 }
9576 }
9577
9578 SDValue Op0 = Op.getOperand(0);
9579 SDValue Op1 = Op.getOperand(1);
9580 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
9581 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
9582}
9583
9584SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
9585 SelectionDAG &DAG) const {
9586 EVT VT = Op.getValueType();
9587
9588 if (VT.isScalableVector() ||
9589 useSVEForFixedLengthVectorVT(
9590 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
9591 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
9592
9593 SDLoc DL(Op);
9594 SDValue REVB;
9595 MVT VST;
9596
9597 switch (VT.getSimpleVT().SimpleTy) {
9598 default:
9599 llvm_unreachable("Invalid type for bitreverse!");
9600
9601 case MVT::v2i32: {
9602 VST = MVT::v8i8;
9603 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
9604
9605 break;
9606 }
9607
9608 case MVT::v4i32: {
9609 VST = MVT::v16i8;
9610 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
9611
9612 break;
9613 }
9614
9615 case MVT::v1i64: {
9616 VST = MVT::v8i8;
9617 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
9618
9619 break;
9620 }
9621
9622 case MVT::v2i64: {
9623 VST = MVT::v16i8;
9624 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
9625
9626 break;
9627 }
9628 }
9629
9630 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
9631 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
9632}
9633
9634 // Check whether N is part of a continuous comparison chain (ORs of XOR compares).
9635static bool
9636isOrXorChain(SDValue N, unsigned &Num,
9637 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
9638 if (Num == MaxXors)
9639 return false;
9640
9641 // Skip the one-use zext
9642 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
9643 N = N->getOperand(0);
9644
9645 // The leaf node must be XOR
9646 if (N->getOpcode() == ISD::XOR) {
9647 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
9648 Num++;
9649 return true;
9650 }
9651
9652 // All the non-leaf nodes must be OR.
9653 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
9654 return false;
9655
9656 if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
9657 isOrXorChain(N->getOperand(1), Num, WorkList))
9658 return true;
9659 return false;
9660}
9661
9662 // Transform chains of ORs and XORs, which are usually produced by expanded memcmp/bcmp.
9663 static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
9664 SDValue LHS = N->getOperand(0);
9665 SDValue RHS = N->getOperand(1);
9666 SDLoc DL(N);
9667 EVT VT = N->getValueType(0);
9668 SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
9669
9670 // Only handle integer compares.
9671 if (N->getOpcode() != ISD::SETCC)
9672 return SDValue();
9673
9674 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
9675 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
9676 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
9677 unsigned NumXors = 0;
9678 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
9679 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
9680 isOrXorChain(LHS, NumXors, WorkList)) {
9681 SDValue XOR0, XOR1;
9682 std::tie(XOR0, XOR1) = WorkList[0];
9683 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
9684 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
9685 for (unsigned I = 1; I < WorkList.size(); I++) {
9686 std::tie(XOR0, XOR1) = WorkList[I];
9687 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
9688 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
9689 }
9690
9691 // Exit early by inverting the condition, which helps reduce indentation.
9692 return Cmp;
9693 }
9694
9695 return SDValue();
9696}
9697
9698SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
9699
9700 if (Op.getValueType().isVector())
9701 return LowerVSETCC(Op, DAG);
9702
9703 bool IsStrict = Op->isStrictFPOpcode();
9704 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
9705 unsigned OpNo = IsStrict ? 1 : 0;
9706 SDValue Chain;
9707 if (IsStrict)
9708 Chain = Op.getOperand(0);
9709 SDValue LHS = Op.getOperand(OpNo + 0);
9710 SDValue RHS = Op.getOperand(OpNo + 1);
9711 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
9712 SDLoc dl(Op);
9713
9714 // We chose ZeroOrOneBooleanContents, so use zero and one.
9715 EVT VT = Op.getValueType();
9716 SDValue TVal = DAG.getConstant(1, dl, VT);
9717 SDValue FVal = DAG.getConstant(0, dl, VT);
9718
9719 // Handle f128 first, since one possible outcome is a normal integer
9720 // comparison which gets picked up by the next if statement.
9721 if (LHS.getValueType() == MVT::f128) {
9722 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
9723 IsSignaling);
9724
9725 // If softenSetCCOperands returned a scalar, use it.
9726 if (!RHS.getNode()) {
9727 assert(LHS.getValueType() == Op.getValueType() &&
9728 "Unexpected setcc expansion!");
9729 return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
9730 }
9731 }
9732
9733 if (LHS.getValueType().isInteger()) {
9734 SDValue CCVal;
9735 SDValue Cmp = getAArch64Cmp(
9736 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
9737
9738 // Note that we inverted the condition above, so we reverse the order of
9739 // the true and false operands here. This will allow the setcc to be
9740 // matched to a single CSINC instruction.
9741 SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
9742 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
9743 }
9744
9745 // Now we know we're dealing with FP values.
9746 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
9747 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
9748
9749 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
9750 // and do the comparison.
9751 SDValue Cmp;
9752 if (IsStrict)
9753 Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
9754 else
9755 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9756
9757 AArch64CC::CondCode CC1, CC2;
9758 changeFPCCToAArch64CC(CC, CC1, CC2);
9759 SDValue Res;
9760 if (CC2 == AArch64CC::AL) {
9761 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
9762 CC2);
9763 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9764
9765 // Note that we inverted the condition above, so we reverse the order of
9766 // the true and false operands here. This will allow the setcc to be
9767 // matched to a single CSINC instruction.
9768 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
9769 } else {
9770 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
9771 // totally clean. Some of them require two CSELs to implement. As is in
9772 // this case, we emit the first CSEL and then emit a second using the output
9773 // of the first as the RHS. We're effectively OR'ing the two CC's together.
9774
9775 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
9776 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9777 SDValue CS1 =
9778 DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
9779
9780 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9781 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
9782 }
9783 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
9784}
9785
9786SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
9787 SelectionDAG &DAG) const {
9788
9789 SDValue LHS = Op.getOperand(0);
9790 SDValue RHS = Op.getOperand(1);
9791 EVT VT = LHS.getValueType();
9792 if (VT != MVT::i32 && VT != MVT::i64)
9793 return SDValue();
9794
9795 SDLoc DL(Op);
9796 SDValue Carry = Op.getOperand(2);
9797 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
9798 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
9799 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue),
9800 LHS, RHS, InvCarry);
9801
9802 EVT OpVT = Op.getValueType();
9803 SDValue TVal = DAG.getConstant(1, DL, OpVT);
9804 SDValue FVal = DAG.getConstant(0, DL, OpVT);
9805
9806 ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
9807 ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
9808 SDValue CCVal =
9809 DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32);
9810 // Inputs are swapped because the condition is inverted. This will allow
9811 // matching with a single CSINC instruction.
9812 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
9813 Cmp.getValue(1));
9814}
9815
9816SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
9817 SDValue RHS, SDValue TVal,
9818 SDValue FVal, const SDLoc &dl,
9819 SelectionDAG &DAG) const {
9820 // Handle f128 first, because it will result in a comparison of some RTLIB
9821 // call result against zero.
9822 if (LHS.getValueType() == MVT::f128) {
9823 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
9824
9825 // If softenSetCCOperands returned a scalar, we need to compare the result
9826 // against zero to select between true and false values.
9827 if (!RHS.getNode()) {
9828 RHS = DAG.getConstant(0, dl, LHS.getValueType());
9829 CC = ISD::SETNE;
9830 }
9831 }
9832
9833 // Also handle f16, for which we need to do a f32 comparison.
9834 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
9835 LHS.getValueType() == MVT::bf16) {
9836 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
9837 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
9838 }
9839
9840 // Next, handle integers.
9841 if (LHS.getValueType().isInteger()) {
9842 assert((LHS.getValueType() == RHS.getValueType()) &&
9843 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
9844
9845 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
9846 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
9847 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
9848 // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
9849 // into (OR (ASR lhs, N-1), 1), which requires less instructions for the
9850 // supported types.
9851 if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
9852 CTVal->isOne() && CFVal->isAllOnes() &&
9853 LHS.getValueType() == TVal.getValueType()) {
9854 EVT VT = LHS.getValueType();
9855 SDValue Shift =
9856 DAG.getNode(ISD::SRA, dl, VT, LHS,
9857 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
9858 return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
9859 }
9860
9861 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
9862 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
9863 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
9864 // Both require less instructions than compare and conditional select.
9865 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
9866 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
9867 LHS.getValueType() == RHS.getValueType()) {
9868 EVT VT = LHS.getValueType();
9869 SDValue Shift =
9870 DAG.getNode(ISD::SRA, dl, VT, LHS,
9871 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
9872
9873 if (CC == ISD::SETGT)
9874 Shift = DAG.getNOT(dl, Shift, VT);
9875
9876 return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
9877 }
9878
9879 unsigned Opcode = AArch64ISD::CSEL;
9880
9881 // If both the TVal and the FVal are constants, see if we can swap them in
9882 // order to form a CSINV or CSINC out of them.
9883 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
9884 std::swap(TVal, FVal);
9885 std::swap(CTVal, CFVal);
9886 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9887 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
9888 std::swap(TVal, FVal);
9889 std::swap(CTVal, CFVal);
9890 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9891 } else if (TVal.getOpcode() == ISD::XOR) {
9892 // If TVal is a NOT we want to swap TVal and FVal so that we can match
9893 // with a CSINV rather than a CSEL.
9894 if (isAllOnesConstant(TVal.getOperand(1))) {
9895 std::swap(TVal, FVal);
9896 std::swap(CTVal, CFVal);
9897 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9898 }
9899 } else if (TVal.getOpcode() == ISD::SUB) {
9900 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
9901 // that we can match with a CSNEG rather than a CSEL.
9902 if (isNullConstant(TVal.getOperand(0))) {
9903 std::swap(TVal, FVal);
9904 std::swap(CTVal, CFVal);
9905 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9906 }
9907 } else if (CTVal && CFVal) {
9908 const int64_t TrueVal = CTVal->getSExtValue();
9909 const int64_t FalseVal = CFVal->getSExtValue();
9910 bool Swap = false;
9911
9912 // If both TVal and FVal are constants, see if FVal is the
9913 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
9914 // instead of a CSEL in that case.
9915 if (TrueVal == ~FalseVal) {
9916 Opcode = AArch64ISD::CSINV;
9917 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
9918 TrueVal == -FalseVal) {
9919 Opcode = AArch64ISD::CSNEG;
9920 } else if (TVal.getValueType() == MVT::i32) {
9921 // If our operands are only 32-bit wide, make sure we use 32-bit
9922 // arithmetic for the check whether we can use CSINC. This ensures that
9923 // the addition in the check will wrap around properly in case there is
9924 // an overflow (which would not be the case if we do the check with
9925 // 64-bit arithmetic).
9926 const uint32_t TrueVal32 = CTVal->getZExtValue();
9927 const uint32_t FalseVal32 = CFVal->getZExtValue();
9928
9929 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
9930 Opcode = AArch64ISD::CSINC;
9931
9932 if (TrueVal32 > FalseVal32) {
9933 Swap = true;
9934 }
9935 }
9936 } else {
9937 // 64-bit check whether we can use CSINC.
9938 const uint64_t TrueVal64 = TrueVal;
9939 const uint64_t FalseVal64 = FalseVal;
9940
9941 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
9942 Opcode = AArch64ISD::CSINC;
9943
9944 if (TrueVal > FalseVal) {
9945 Swap = true;
9946 }
9947 }
9948 }
9949
9950 // Swap TVal and FVal if necessary.
9951 if (Swap) {
9952 std::swap(TVal, FVal);
9953 std::swap(CTVal, CFVal);
9954 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9955 }
9956
9957 if (Opcode != AArch64ISD::CSEL) {
9958 // Drop FVal since we can get its value by simply inverting/negating
9959 // TVal.
9960 FVal = TVal;
9961 }
9962 }
9963
9964 // Avoid materializing a constant when possible by reusing a known value in
9965 // a register. However, don't perform this optimization if the known value
9966 // is one, zero or negative one in the case of a CSEL. We can always
9967 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
9968 // FVal, respectively.
9969 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
9970 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
9971 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
9972 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
9973 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
9974 // "a != C ? x : a" to avoid materializing C.
9975 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
9976 TVal = LHS;
9977 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
9978 FVal = LHS;
9979 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
9980 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
9981 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
9982 // avoid materializing C.
9983 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
9984 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
9985 Opcode = AArch64ISD::CSINV;
9986 TVal = LHS;
9987 FVal = DAG.getConstant(0, dl, FVal.getValueType());
9988 }
9989 }
9990
9991 SDValue CCVal;
9992 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
9993 EVT VT = TVal.getValueType();
9994 return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
9995 }
9996
9997 // Now we know we're dealing with FP values.
9998 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
9999 LHS.getValueType() == MVT::f64);
10000 assert(LHS.getValueType() == RHS.getValueType());
10001 EVT VT = TVal.getValueType();
10002 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
10003
10004 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
10005 // clean. Some of them require two CSELs to implement.
10006 AArch64CC::CondCode CC1, CC2;
10007 changeFPCCToAArch64CC(CC, CC1, CC2);
10008
10009 if (DAG.getTarget().Options.UnsafeFPMath) {
10010 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
10011 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
10012 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
10013 if (RHSVal && RHSVal->isZero()) {
10014 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
10015 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
10016
10017 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
10018 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
10019 TVal = LHS;
10020 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
10021 CFVal && CFVal->isZero() &&
10022 FVal.getValueType() == LHS.getValueType())
10023 FVal = LHS;
10024 }
10025 }
10026
10027 // Emit first, and possibly only, CSEL.
10028 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
10029 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
10030
10031 // If we need a second CSEL, emit it, using the output of the first as the
10032 // RHS. We're effectively OR'ing the two CC's together.
10033 if (CC2 != AArch64CC::AL) {
10034 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
10035 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
10036 }
10037
10038 // Otherwise, return the output of the first CSEL.
10039 return CS1;
10040}
10041
10042SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
10043 SelectionDAG &DAG) const {
10044 EVT Ty = Op.getValueType();
10045 auto Idx = Op.getConstantOperandAPInt(2);
10046 int64_t IdxVal = Idx.getSExtValue();
10047 assert(Ty.isScalableVector() &&
10048 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
10049
10050 // We can use the splice instruction for certain index values where we are
10051 // able to efficiently generate the correct predicate. The index will be
10052 // inverted and used directly as the input to the ptrue instruction, i.e.
10053 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
10054 // splice predicate. However, we can only do this if we can guarantee that
10055 // there are enough elements in the vector, hence we check the index <= min
10056 // number of elements.
10057 std::optional<unsigned> PredPattern;
10058 if (Ty.isScalableVector() && IdxVal < 0 &&
10059 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
10060 std::nullopt) {
10061 SDLoc DL(Op);
10062
10063 // Create a predicate where all but the last -IdxVal elements are false.
10064 EVT PredVT = Ty.changeVectorElementType(MVT::i1);
10065 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
10066 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
10067
10068 // Now splice the two inputs together using the predicate.
10069 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
10070 Op.getOperand(1));
10071 }
10072
10073 // This will select to an EXT instruction, which has a maximum immediate
10074 // value of 255, hence 2048-bits is the maximum value we can lower.
10075 if (IdxVal >= 0 &&
10076 IdxVal < int64_t(2048 / Ty.getVectorElementType().getSizeInBits()))
10077 return Op;
10078
10079 return SDValue();
10080}
10081
10082SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
10083 SelectionDAG &DAG) const {
10084 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
10085 SDValue LHS = Op.getOperand(0);
10086 SDValue RHS = Op.getOperand(1);
10087 SDValue TVal = Op.getOperand(2);
10088 SDValue FVal = Op.getOperand(3);
10089 SDLoc DL(Op);
10090 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
10091}
10092
10093SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
10094 SelectionDAG &DAG) const {
10095 SDValue CCVal = Op->getOperand(0);
10096 SDValue TVal = Op->getOperand(1);
10097 SDValue FVal = Op->getOperand(2);
10098 SDLoc DL(Op);
10099
10100 EVT Ty = Op.getValueType();
10101 if (Ty == MVT::aarch64svcount) {
10102 TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
10103 FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
10104 SDValue Sel =
10105 DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
10106 return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
10107 }
10108
10109 if (Ty.isScalableVector()) {
10110 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
10111 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
10112 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
10113 }
10114
10115 if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
10116 // FIXME: Ideally this would be the same as above using i1 types, however
10117 // for the moment we can't deal with fixed i1 vector types properly, so
10118 // instead extend the predicate to a result type sized integer vector.
10119 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
10120 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
10121 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
10122 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
10123 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
10124 }
10125
10126 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
10127 // instruction.
10128 if (ISD::isOverflowIntrOpRes(CCVal)) {
10129 // Only lower legal XALUO ops.
10130 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
10131 return SDValue();
10132 
10133 AArch64CC::CondCode OFCC;
10134 SDValue Value, Overflow;
10135 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
10136 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
10137
10138 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
10139 CCVal, Overflow);
10140 }
10141
10142 // Lower it the same way as we would lower a SELECT_CC node.
10143 ISD::CondCode CC;
10144 SDValue LHS, RHS;
10145 if (CCVal.getOpcode() == ISD::SETCC) {
10146 LHS = CCVal.getOperand(0);
10147 RHS = CCVal.getOperand(1);
10148 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
10149 } else {
10150 LHS = CCVal;
10151 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
10152 CC = ISD::SETNE;
10153 }
10154
10155 // If we are lowering a f16 and we do not have fullf16, convert to a f32 in
10156 // order to use FCSELSrrr
10157 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
10158 TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
10159 DAG.getUNDEF(MVT::f32), TVal);
10160 FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
10161 DAG.getUNDEF(MVT::f32), FVal);
10162 }
10163
10164 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
10165
10166 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
10167 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
10168 }
10169
10170 return Res;
10171}
10172
10173SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
10174 SelectionDAG &DAG) const {
10175 // Jump table entries are emitted as PC-relative offsets. No additional
10176 // tweaking is necessary here. Just get the address of the jump table.
10177 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
10178 
10179 CodeModel::Model CM = getTargetMachine().getCodeModel();
10180 if (CM == CodeModel::Large && !getTargetMachine().isPositionIndependent() &&
10181 !Subtarget->isTargetMachO())
10182 return getAddrLarge(JT, DAG);
10183 if (CM == CodeModel::Tiny)
10184 return getAddrTiny(JT, DAG);
10185 return getAddr(JT, DAG);
10186}
10187
10188SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
10189 SelectionDAG &DAG) const {
10190 // Jump table entries are emitted as PC-relative offsets. No additional
10191 // tweaking is necessary here. Just get the address of the jump table.
10192 SDLoc DL(Op);
10193 SDValue JT = Op.getOperand(1);
10194 SDValue Entry = Op.getOperand(2);
10195 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
10196
10197 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10198 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
10199
10200 SDNode *Dest =
10201 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
10202 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
10203 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
10204 return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
10205}
10206
10207SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
10208 SelectionDAG &DAG) const {
10209 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
10210 CodeModel::Model CM = getTargetMachine().getCodeModel();
10211 if (CM == CodeModel::Large) {
10212 // Use the GOT for the large code model on iOS.
10213 if (Subtarget->isTargetMachO()) {
10214 return getGOT(CP, DAG);
10215 }
10217 return getAddrLarge(CP, DAG);
10218 } else if (CM == CodeModel::Tiny) {
10219 return getAddrTiny(CP, DAG);
10220 }
10221 return getAddr(CP, DAG);
10222}
10223
10224SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
10225 SelectionDAG &DAG) const {
10226 BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
10227 CodeModel::Model CM = getTargetMachine().getCodeModel();
10228 if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
10230 return getAddrLarge(BA, DAG);
10231 } else if (CM == CodeModel::Tiny) {
10232 return getAddrTiny(BA, DAG);
10233 }
10234 return getAddr(BA, DAG);
10235}
10236
10237SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
10238 SelectionDAG &DAG) const {
10239 AArch64FunctionInfo *FuncInfo =
10240 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10241
10242 SDLoc DL(Op);
10243 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
10244 getPointerTy(DAG.getDataLayout()));
10245 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
10246 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10247 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
10248 MachinePointerInfo(SV));
10249}
10250
10251SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
10252 SelectionDAG &DAG) const {
10253 MachineFunction &MF = DAG.getMachineFunction();
10254 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10255 
10256 SDLoc DL(Op);
10257 SDValue FR;
10258 if (Subtarget->isWindowsArm64EC()) {
10259 // With the Arm64EC ABI, we compute the address of the varargs save area
10260 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
10261 // but calls from an entry thunk can pass in a different address.
10262 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
10263 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
10264 uint64_t StackOffset;
10265 if (FuncInfo->getVarArgsGPRSize() > 0)
10266 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
10267 else
10268 StackOffset = FuncInfo->getVarArgsStackOffset();
10269 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
10270 DAG.getConstant(StackOffset, DL, MVT::i64));
10271 } else {
10272 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
10273 ? FuncInfo->getVarArgsGPRIndex()
10274 : FuncInfo->getVarArgsStackIndex(),
10275 getPointerTy(DAG.getDataLayout()));
10276 }
10277 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10278 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
10279 MachinePointerInfo(SV));
10280}
10281
10282SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
10283 SelectionDAG &DAG) const {
10284 // The layout of the va_list struct is specified in the AArch64 Procedure Call
10285 // Standard, section B.3.
10286 MachineFunction &MF = DAG.getMachineFunction();
10287 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10288 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
10289 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10290 auto PtrVT = getPointerTy(DAG.getDataLayout());
10291 SDLoc DL(Op);
10292
10293 SDValue Chain = Op.getOperand(0);
10294 SDValue VAList = Op.getOperand(1);
10295 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10296 SmallVector<SDValue, 4> MemOps;
10297
10298 // void *__stack at offset 0
10299 unsigned Offset = 0;
10300 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
10301 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
10302 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
10303 MachinePointerInfo(SV), Align(PtrSize)));
10304
10305 // void *__gr_top at offset 8 (4 on ILP32)
10306 Offset += PtrSize;
10307 int GPRSize = FuncInfo->getVarArgsGPRSize();
10308 if (GPRSize > 0) {
10309 SDValue GRTop, GRTopAddr;
10310
10311 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10312 DAG.getConstant(Offset, DL, PtrVT));
10313
10314 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
10315 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
10316 DAG.getConstant(GPRSize, DL, PtrVT));
10317 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
10318
10319 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
10320 MachinePointerInfo(SV, Offset),
10321 Align(PtrSize)));
10322 }
10323
10324 // void *__vr_top at offset 16 (8 on ILP32)
10325 Offset += PtrSize;
10326 int FPRSize = FuncInfo->getVarArgsFPRSize();
10327 if (FPRSize > 0) {
10328 SDValue VRTop, VRTopAddr;
10329 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10330 DAG.getConstant(Offset, DL, PtrVT));
10331
10332 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
10333 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
10334 DAG.getConstant(FPRSize, DL, PtrVT));
10335 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
10336
10337 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
10338 MachinePointerInfo(SV, Offset),
10339 Align(PtrSize)));
10340 }
10341
10342 // int __gr_offs at offset 24 (12 on ILP32)
10343 Offset += PtrSize;
10344 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10345 DAG.getConstant(Offset, DL, PtrVT));
10346 MemOps.push_back(
10347 DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32),
10348 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
10349
10350 // int __vr_offs at offset 28 (16 on ILP32)
10351 Offset += 4;
10352 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10353 DAG.getConstant(Offset, DL, PtrVT));
10354 MemOps.push_back(
10355 DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32),
10356 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
10357
10358 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
10359}
10360
10361SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
10362 SelectionDAG &DAG) const {
10363 MachineFunction &MF = DAG.getMachineFunction();
10364
10365 if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()))
10366 return LowerWin64_VASTART(Op, DAG);
10367 else if (Subtarget->isTargetDarwin())
10368 return LowerDarwin_VASTART(Op, DAG);
10369 else
10370 return LowerAAPCS_VASTART(Op, DAG);
10371}
10372
10373SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
10374 SelectionDAG &DAG) const {
10375 // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single
10376 // pointer.
10377 SDLoc DL(Op);
10378 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
10379 unsigned VaListSize =
10380 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
10381 ? PtrSize
10382 : Subtarget->isTargetILP32() ? 20 : 32;
10383 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
10384 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
10385
10386 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
10387 DAG.getConstant(VaListSize, DL, MVT::i32),
10388 Align(PtrSize), false, false, false,
10389 MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
10390}
10391
10392SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
10393 assert(Subtarget->isTargetDarwin() &&
10394 "automatic va_arg instruction only works on Darwin");
10395
10396 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10397 EVT VT = Op.getValueType();
10398 SDLoc DL(Op);
10399 SDValue Chain = Op.getOperand(0);
10400 SDValue Addr = Op.getOperand(1);
10401 MaybeAlign Align(Op.getConstantOperandVal(3));
10402 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
10403 auto PtrVT = getPointerTy(DAG.getDataLayout());
10404 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10405 SDValue VAList =
10406 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
10407 Chain = VAList.getValue(1);
10408 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
10409
10410 if (VT.isScalableVector())
10411 report_fatal_error("Passing SVE types to variadic functions is "
10412 "currently not supported");
10413
10414 if (Align && *Align > MinSlotSize) {
10415 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10416 DAG.getConstant(Align->value() - 1, DL, PtrVT));
10417 VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
10418 DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
10419 }
10420
10421 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
10422 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
10423
10424 // Scalar integer and FP values smaller than 64 bits are implicitly extended
10425 // up to 64 bits. At the very least, we have to increase the striding of the
10426 // vaargs list to match this, and for FP values we need to introduce
10427 // FP_ROUND nodes as well.
10428 if (VT.isInteger() && !VT.isVector())
10429 ArgSize = std::max(ArgSize, MinSlotSize);
10430 bool NeedFPTrunc = false;
10431 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
10432 ArgSize = 8;
10433 NeedFPTrunc = true;
10434 }
10435
10436 // Increment the pointer, VAList, to the next vaarg
10437 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10438 DAG.getConstant(ArgSize, DL, PtrVT));
10439 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
10440
10441 // Store the incremented VAList to the legalized pointer
10442 SDValue APStore =
10443 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
10444
10445 // Load the actual argument out of the pointer VAList
10446 if (NeedFPTrunc) {
10447 // Load the value as an f64.
10448 SDValue WideFP =
10449 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
10450 // Round the value down to an f32.
10451 SDValue NarrowFP =
10452 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
10453 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
10454 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
10455 // Merge the rounded value with the chain output of the load.
10456 return DAG.getMergeValues(Ops, DL);
10457 }
10458
10459 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
10460}
10461
10462SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
10463 SelectionDAG &DAG) const {
10464 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10465 MFI.setFrameAddressIsTaken(true);
10466
10467 EVT VT = Op.getValueType();
10468 SDLoc DL(Op);
10469 unsigned Depth = Op.getConstantOperandVal(0);
10470 SDValue FrameAddr =
10471 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
10472 while (Depth--)
10473 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
10474 MachinePointerInfo());
10475
10476 if (Subtarget->isTargetILP32())
10477 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
10478 DAG.getValueType(VT));
10479
10480 return FrameAddr;
10481}
10482
10483SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
10484 SelectionDAG &DAG) const {
10485 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10486
10487 EVT VT = getPointerTy(DAG.getDataLayout());
10488 SDLoc DL(Op);
10489 int FI = MFI.CreateFixedObject(4, 0, false);
10490 return DAG.getFrameIndex(FI, VT);
10491}
10492
10493#define GET_REGISTER_MATCHER
10494#include "AArch64GenAsmMatcher.inc"
10495
10496// FIXME? Maybe this could be a TableGen attribute on some registers and
10497// this table could be generated automatically from RegInfo.
10498Register AArch64TargetLowering::
10499 getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
10500 Register Reg = MatchRegisterName(RegName);
10501 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
10502 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
10503 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
10504 if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
10505 !MRI->isReservedReg(MF, Reg))
10506 Reg = 0;
10507 }
10508 if (Reg)
10509 return Reg;
10510 report_fatal_error(Twine("Invalid register name \""
10511 + StringRef(RegName) + "\"."));
10512}
10513
10514SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
10515 SelectionDAG &DAG) const {
10516 DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
10517
10518 EVT VT = Op.getValueType();
10519 SDLoc DL(Op);
10520
10521 SDValue FrameAddr =
10522 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
10523 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
10524
10525 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
10526}
10527
10528SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
10529 SelectionDAG &DAG) const {
10530 MachineFunction &MF = DAG.getMachineFunction();
10531 MachineFrameInfo &MFI = MF.getFrameInfo();
10532 MFI.setReturnAddressIsTaken(true);
10533
10534 EVT VT = Op.getValueType();
10535 SDLoc DL(Op);
10536 unsigned Depth = Op.getConstantOperandVal(0);
10537 SDValue ReturnAddress;
10538 if (Depth) {
10539 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
10540 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
10541 ReturnAddress = DAG.getLoad(
10542 VT, DL, DAG.getEntryNode(),
10543 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
10544 } else {
10545 // Return LR, which contains the return address. Mark it an implicit
10546 // live-in.
10547 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
10548 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
10549 }
10550
10551 // The XPACLRI instruction assembles to a hint-space instruction before
10552 // Armv8.3-A, so it can be used safely on any pre-Armv8.3-A architecture.
10553 // On Armv8.3-A and onwards XPACI is available, so use
10554 // that instead.
10555 SDNode *St;
10556 if (Subtarget->hasPAuth()) {
10557 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
10558 } else {
10559 // XPACLRI operates on LR therefore we must move the operand accordingly.
10560 SDValue Chain =
10561 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
10562 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
10563 }
10564 return SDValue(St, 0);
10565}
10566
10567 /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
10568 /// i64 values and take a 2 x i64 value to shift plus a shift amount.
10569SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
10570 SelectionDAG &DAG) const {
10571 SDValue Lo, Hi;
10572 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
10573 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
10574}
10575
10576 bool AArch64TargetLowering::isOffsetFoldingLegal(
10577 const GlobalAddressSDNode *GA) const {
10578 // Offsets are folded in the DAG combine rather than here so that we can
10579 // intelligently choose an offset based on the uses.
10580 return false;
10581}
10582
10583 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
10584 bool OptForSize) const {
10585 bool IsLegal = false;
10586 // We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and
10587 // 16-bit case when target has full fp16 support.
10588 // We encode bf16 bit patterns as if they were fp16. This results in very
10589 // strange looking assembly but should populate the register with appropriate
10590 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
10591 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
10592 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
10593 // FIXME: We should be able to handle f128 as well with a clever lowering.
10594 const APInt ImmInt = Imm.bitcastToAPInt();
10595 if (VT == MVT::f64)
10596 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
10597 else if (VT == MVT::f32)
10598 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
10599 else if (VT == MVT::f16 || VT == MVT::bf16)
10600 IsLegal =
10601 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
10602 Imm.isPosZero();
10603
10604 // If we cannot materialize the constant in an fmov immediate field, check if the
10605 // value can be encoded as the immediate operand of a logical instruction.
10606 // The immediate value will be created with either MOVZ, MOVN, or ORR.
10607 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
10608 // generate that fmov.
10609 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
10610 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
10611 // however the mov+fmov sequence is always better because of the reduced
10612 // cache pressure. The timings are still the same if you consider
10613 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
10614 // movw+movk is fused). So we limit to at most 2 instructions.
10615 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
10616 AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn);
10617 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
10618 IsLegal = Insn.size() <= Limit;
10619 }
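// For instance, +0.0 and small constants such as 1.0 or 3.0 match the FMOV
// imm8 encodings above, while a value like 0.1f has no imm8 encoding and is
// only accepted here if it can be built with at most Limit MOVZ/MOVN/MOVK
// instructions followed by an fmov from the integer register.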
10620
10621 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
10622 << " imm value: "; Imm.dump(););
10623 return IsLegal;
10624}
10625
10626//===----------------------------------------------------------------------===//
10627// AArch64 Optimization Hooks
10628//===----------------------------------------------------------------------===//
10629
10630static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
10631 SDValue Operand, SelectionDAG &DAG,
10632 int &ExtraSteps) {
10633 EVT VT = Operand.getValueType();
10634 if ((ST->hasNEON() &&
10635 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
10636 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
10637 VT == MVT::v4f32)) ||
10638 (ST->hasSVE() &&
10639 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
10640 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) {
10641 // For the reciprocal estimates, convergence is quadratic, so the number
10642 // of digits is doubled after each iteration. In ARMv8, the accuracy of
10643 // the initial estimate is 2^-8. Thus the number of extra steps to refine
10644 // the result for float (23 mantissa bits) is 2 and for double (52
10645 // mantissa bits) is 3.
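      // Worked example of the computation below: float has 24 bits of
      // precision, so ExtraSteps = Log2_64_Ceil(24) - Log2_64_Ceil(8)
      // = 5 - 3 = 2; double has 53 bits, giving 6 - 3 = 3 extra steps.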
10646 constexpr unsigned AccurateBits = 8;
10647 unsigned DesiredBits =
10648 APFloat::semanticsPrecision(DAG.EVTToAPFloatSemantics(VT));
10649 ExtraSteps = DesiredBits <= AccurateBits
10650 ? 0
10651 : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
10652 }
10653
10654 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
10655 }
10656
10657 return SDValue();
10658}
10659
10660SDValue
10661AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
10662 const DenormalMode &Mode) const {
10663 SDLoc DL(Op);
10664 EVT VT = Op.getValueType();
10665 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
10666 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
10667 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
10668}
10669
10670SDValue
10671AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
10672 SelectionDAG &DAG) const {
10673 return Op;
10674}
10675
10676SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
10677 SelectionDAG &DAG, int Enabled,
10678 int &ExtraSteps,
10679 bool &UseOneConst,
10680 bool Reciprocal) const {
10681 if (Enabled == ReciprocalEstimate::Enabled ||
10682 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
10683 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
10684 DAG, ExtraSteps)) {
10685 SDLoc DL(Operand);
10686 EVT VT = Operand.getValueType();
10687
10688 SDNodeFlags Flags;
10689 Flags.setAllowReassociation(true);
10690
10691 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
10692 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
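      // In the loop below, Step = E * E, FRSQRTS(X, Step) computes
      // 0.5 * (3 - X * E^2), and the final FMUL forms the refined estimate
      // E' = E * 0.5 * (3 - X * E^2).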
10693 for (int i = ExtraSteps; i > 0; --i) {
10694 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
10695 Flags);
10696 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
10697 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
10698 }
10699 if (!Reciprocal)
10700 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
10701
10702 ExtraSteps = 0;
10703 return Estimate;
10704 }
10705
10706 return SDValue();
10707}
10708
10709SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
10710 SelectionDAG &DAG, int Enabled,
10711 int &ExtraSteps) const {
10712 if (Enabled == ReciprocalEstimate::Enabled)
10713 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
10714 DAG, ExtraSteps)) {
10715 SDLoc DL(Operand);
10716 EVT VT = Operand.getValueType();
10717
10718 SDNodeFlags Flags;
10719 Flags.setAllowReassociation(true);
10720
10721 // Newton reciprocal iteration: E * (2 - X * E)
10722 // AArch64 reciprocal iteration instruction: (2 - M * N)
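      // In the loop below, FRECPS(X, E) computes (2 - X * E), so each FMUL
      // forms the refined estimate E' = E * (2 - X * E).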
10723 for (int i = ExtraSteps; i > 0; --i) {
10724 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
10725 Estimate, Flags);
10726 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
10727 }
10728
10729 ExtraSteps = 0;
10730 return Estimate;
10731 }
10732
10733 return SDValue();
10734}
10735
10736//===----------------------------------------------------------------------===//
10737// AArch64 Inline Assembly Support
10738//===----------------------------------------------------------------------===//
10739
10740// Table of Constraints
10741// TODO: This is the current set of constraints supported by ARM for the
10742// compiler, not all of them may make sense.
10743//
10744// r - A general register
10745// w - An FP/SIMD register of some size in the range v0-v31
10746// x - An FP/SIMD register of some size in the range v0-v15
10747// I - Constant that can be used with an ADD instruction
10748// J - Constant that can be used with a SUB instruction
10749// K - Constant that can be used with a 32-bit logical instruction
10750// L - Constant that can be used with a 64-bit logical instruction
10751// M - Constant that can be used as a 32-bit MOV immediate
10752// N - Constant that can be used as a 64-bit MOV immediate
10753// Q - A memory reference with base register and no offset
10754// S - A symbolic address
10755// Y - Floating point constant zero
10756// Z - Integer constant zero
10757//
10758// Note that general register operands will be output using their 64-bit x
10759// register name, whatever the size of the variable, unless the asm operand
10760// is prefixed by the %w modifier. Floating-point and SIMD register operands
10761// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
10762// %q modifier.
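// For example, an asm operand written as "r"(x) is allocated to a general
// register (printed as x0-x30 unless the %w modifier is used), "w"(v) to a
// SIMD/FP register, and "I"(imm) requires an ADD-range immediate
// (0-4095, optionally shifted left by 12).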
10763const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
10764 // At this point, we have to lower this constraint to something else, so we
10765 // lower it to an "r" or "w". However, by doing this we will force the result
10766 // to be in register, while the X constraint is much more permissive.
10767 //
10768 // Although we are correct (we are free to emit anything, without
10769 // constraints), we might break use cases that would expect us to be more
10770 // efficient and emit something else.
10771 if (!Subtarget->hasFPARMv8())
10772 return "r";
10773
10774 if (ConstraintVT.isFloatingPoint())
10775 return "w";
10776
10777 if (ConstraintVT.isVector() &&
10778 (ConstraintVT.getSizeInBits() == 64 ||
10779 ConstraintVT.getSizeInBits() == 128))
10780 return "w";
10781
10782 return "r";
10783}
10784
10785 enum class PredicateConstraint { Uph, Upl, Upa };
10786
10787static std::optional<PredicateConstraint>
10790 .Case("Uph", PredicateConstraint::Uph)
10791 .Case("Upl", PredicateConstraint::Upl)
10792 .Case("Upa", PredicateConstraint::Upa)
10793 .Default(std::nullopt);
10794}
10795
10796static const TargetRegisterClass *
10797 getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) {
10798 if (VT != MVT::aarch64svcount &&
10799 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
10800 return nullptr;
10801
10802 switch (Constraint) {
10803 case PredicateConstraint::Uph:
10804 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
10805 : &AArch64::PPR_p8to15RegClass;
10806 case PredicateConstraint::Upl:
10807 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
10808 : &AArch64::PPR_3bRegClass;
10809 case PredicateConstraint::Upa:
10810 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
10811 : &AArch64::PPRRegClass;
10812 }
10813
10814 llvm_unreachable("Missing PredicateConstraint!");
10815}
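// For example, an SVE predicate operand written with "Upa" may be allocated
// to any of p0-p15, "Upl" restricts it to p0-p7, and "Uph" to p8-p15; the
// PNR variants are chosen when the value type is aarch64svcount.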
10816
10817 enum class ReducedGprConstraint { Uci, Ucj };
10818
10819static std::optional<ReducedGprConstraint>
10822 .Case("Uci", ReducedGprConstraint::Uci)
10823 .Case("Ucj", ReducedGprConstraint::Ucj)
10824 .Default(std::nullopt);
10825}
10826
10827static const TargetRegisterClass *
10828 getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT) {
10829 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
10830 return nullptr;
10831
10832 switch (Constraint) {
10833 case ReducedGprConstraint::Uci:
10834 return &AArch64::MatrixIndexGPR32_8_11RegClass;
10835 case ReducedGprConstraint::Ucj:
10836 return &AArch64::MatrixIndexGPR32_12_15RegClass;
10837 }
10838
10839 llvm_unreachable("Missing ReducedGprConstraint!");
10840}
10841
10842// The set of cc code supported is from
10843// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
10846 .Case("{@cchi}", AArch64CC::HI)
10847 .Case("{@cccs}", AArch64CC::HS)
10848 .Case("{@cclo}", AArch64CC::LO)
10849 .Case("{@ccls}", AArch64CC::LS)
10850 .Case("{@cccc}", AArch64CC::LO)
10851 .Case("{@cceq}", AArch64CC::EQ)
10852 .Case("{@ccgt}", AArch64CC::GT)
10853 .Case("{@ccge}", AArch64CC::GE)
10854 .Case("{@cclt}", AArch64CC::LT)
10855 .Case("{@ccle}", AArch64CC::LE)
10856 .Case("{@cchs}", AArch64CC::HS)
10857 .Case("{@ccne}", AArch64CC::NE)
10858 .Case("{@ccvc}", AArch64CC::VC)
10859 .Case("{@ccpl}", AArch64CC::PL)
10860 .Case("{@ccvs}", AArch64CC::VS)
10861 .Case("{@ccmi}", AArch64CC::MI)
10862 .Default(AArch64CC::Invalid);
10863 return Cond;
10864}
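// For example, a flag-output operand written as "=@cceq"(out) in inline asm
// corresponds to the "{@cceq}" entry above and yields AArch64CC::EQ;
// LowerAsmOutputForConstraint below then materializes the flag value with a
// CSINC (CSET) of NZCV.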
10865
10866/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
10867/// WZR, invert(<cond>)'.
10868 static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL,
10869 SelectionDAG &DAG) {
10870 return DAG.getNode(
10871 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
10872 DAG.getConstant(0, DL, MVT::i32),
10873 DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32), NZCV);
10874}
10875
10876// Lower @cc flag output via getSETCC.
10877SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
10878 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
10879 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
10880 AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
10881 if (Cond == AArch64CC::Invalid)
10882 return SDValue();
10883 // The output variable should be a scalar integer.
10884 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
10885 OpInfo.ConstraintVT.getSizeInBits() < 8)
10886 report_fatal_error("Flag output operand is of invalid type");
10887
10888 // Get NZCV register. Only update chain when copyfrom is glued.
10889 if (Glue.getNode()) {
10890 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32, Glue);
10891 Chain = Glue.getValue(1);
10892 } else
10893 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32);
10894 // Extract CC code.
10895 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
10896
10897 SDValue Result;
10898
10899 // Truncate or ZERO_EXTEND based on value types.
10900 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
10901 Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
10902 else
10903 Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
10904
10905 return Result;
10906}
10907
10908/// getConstraintType - Given a constraint letter, return the type of
10909/// constraint it is for this target.
10910 AArch64TargetLowering::ConstraintType
10911 AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
10912 if (Constraint.size() == 1) {
10913 switch (Constraint[0]) {
10914 default:
10915 break;
10916 case 'x':
10917 case 'w':
10918 case 'y':
10919 return C_RegisterClass;
10920 // An address with a single base register. Due to the way we
10921 // currently handle addresses it is the same as 'r'.
10922 case 'Q':
10923 return C_Memory;
10924 case 'I':
10925 case 'J':
10926 case 'K':
10927 case 'L':
10928 case 'M':
10929 case 'N':
10930 case 'Y':
10931 case 'Z':
10932 return C_Immediate;
10933 case 'z':
10934 case 'S': // A symbol or label reference with a constant offset
10935 return C_Other;
10936 }
10937 } else if (parsePredicateConstraint(Constraint))
10938 return C_RegisterClass;
10939 else if (parseReducedGprConstraint(Constraint))
10940 return C_RegisterClass;
10941 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
10942 return C_Other;
10943 return TargetLowering::getConstraintType(Constraint);
10944}
10945
10946/// Examine constraint type and operand type and determine a weight value.
10947/// This object must already have been set up with the operand type
10948/// and the current alternative constraint selected.
10949 TargetLowering::ConstraintWeight
10950 AArch64TargetLowering::getSingleConstraintMatchWeight(
10951 AsmOperandInfo &info, const char *constraint) const {
10952 ConstraintWeight weight = CW_Invalid;
10953 Value *CallOperandVal = info.CallOperandVal;
10954 // If we don't have a value, we can't do a match,
10955 // but allow it at the lowest weight.
10956 if (!CallOperandVal)
10957 return CW_Default;
10958 Type *type = CallOperandVal->getType();
10959 // Look at the constraint type.
10960 switch (*constraint) {
10961 default:
10962 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
10963 break;
10964 case 'x':
10965 case 'w':
10966 case 'y':
10967 if (type->isFloatingPointTy() || type->isVectorTy())
10968 weight = CW_Register;
10969 break;
10970 case 'z':
10971 weight = CW_Constant;
10972 break;
10973 case 'U':
10974 if (parsePredicateConstraint(constraint) ||
10975 parseReducedGprConstraint(constraint))
10976 weight = CW_Register;
10977 break;
10978 }
10979 return weight;
10980}
10981
10982std::pair<unsigned, const TargetRegisterClass *>
10983AArch64TargetLowering::getRegForInlineAsmConstraint(
10984 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
10985 if (Constraint.size() == 1) {
10986 switch (Constraint[0]) {
10987 case 'r':
10988 if (VT.isScalableVector())
10989 return std::make_pair(0U, nullptr);
10990 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
10991 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
10992 if (VT.getFixedSizeInBits() == 64)
10993 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
10994 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
10995 case 'w': {
10996 if (!Subtarget->hasFPARMv8())
10997 break;
10998 if (VT.isScalableVector()) {
10999 if (VT.getVectorElementType() != MVT::i1)
11000 return std::make_pair(0U, &AArch64::ZPRRegClass);
11001 return std::make_pair(0U, nullptr);
11002 }
11003 uint64_t VTSize = VT.getFixedSizeInBits();
11004 if (VTSize == 16)
11005 return std::make_pair(0U, &AArch64::FPR16RegClass);
11006 if (VTSize == 32)
11007 return std::make_pair(0U, &AArch64::FPR32RegClass);
11008 if (VTSize == 64)
11009 return std::make_pair(0U, &AArch64::FPR64RegClass);
11010 if (VTSize == 128)
11011 return std::make_pair(0U, &AArch64::FPR128RegClass);
11012 break;
11013 }
11014 // The instructions that this constraint is designed for can
11015 // only take 128-bit registers so just use that regclass.
11016 case 'x':
11017 if (!Subtarget->hasFPARMv8())
11018 break;
11019 if (VT.isScalableVector())
11020 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
11021 if (VT.getSizeInBits() == 128)
11022 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
11023 break;
11024 case 'y':
11025 if (!Subtarget->hasFPARMv8())
11026 break;
11027 if (VT.isScalableVector())
11028 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
11029 break;
11030 }
11031 } else {
11032 if (const auto PC = parsePredicateConstraint(Constraint))
11033 if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
11034 return std::make_pair(0U, RegClass);
11035
11036 if (const auto RGC = parseReducedGprConstraint(Constraint))
11037 if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
11038 return std::make_pair(0U, RegClass);
11039 }
11040 if (StringRef("{cc}").equals_insensitive(Constraint) ||
11041 parseConstraintCode(Constraint) != AArch64CC::Invalid)
11042 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
11043
11044 if (Constraint == "{za}") {
11045 return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
11046 }
11047
11048 if (Constraint == "{zt0}") {
11049 return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
11050 }
11051
11052 // Use the default implementation in TargetLowering to convert the register
11053 // constraint into a member of a register class.
11054 std::pair<unsigned, const TargetRegisterClass *> Res;
11055 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
11056
11057 // Not found as a standard register?
11058 if (!Res.second) {
11059 unsigned Size = Constraint.size();
11060 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
11061 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
11062 int RegNo;
11063 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
11064 if (!Failed && RegNo >= 0 && RegNo <= 31) {
11065 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
11066 // By default we'll emit v0-v31 for this unless there's a modifier where
11067 // we'll emit the correct register as well.
11068 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
11069 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
11070 Res.second = &AArch64::FPR64RegClass;
11071 } else {
11072 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
11073 Res.second = &AArch64::FPR128RegClass;
11074 }
11075 }
11076 }
11077 }
11078
11079 if (Res.second && !Subtarget->hasFPARMv8() &&
11080 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
11081 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
11082 return std::make_pair(0U, nullptr);
11083
11084 return Res;
11085}
11086
11087 EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
11088 llvm::Type *Ty,
11089 bool AllowUnknown) const {
11090 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
11091 return EVT(MVT::i64x8);
11092
11093 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
11094}
11095
11096/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
11097/// vector. If it is invalid, don't add anything to Ops.
11098void AArch64TargetLowering::LowerAsmOperandForConstraint(
11099 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
11100 SelectionDAG &DAG) const {
11101 SDValue Result;
11102
11103 // Currently only support length 1 constraints.
11104 if (Constraint.size() != 1)
11105 return;
11106
11107 char ConstraintLetter = Constraint[0];
11108 switch (ConstraintLetter) {
11109 default:
11110 break;
11111
11112 // This set of constraints deal with valid constants for various instructions.
11113 // Validate and return a target constant for them if we can.
11114 case 'z': {
11115 // 'z' maps to xzr or wzr so it needs an input of 0.
11116 if (!isNullConstant(Op))
11117 return;
11118
11119 if (Op.getValueType() == MVT::i64)
11120 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
11121 else
11122 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
11123 break;
11124 }
11125 case 'S':
11126 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
11127 // supported for PIC while "s" isn't, making "s" less useful. We implement
11128 // "S" but not "s".
11130 break;
11131
11132 case 'I':
11133 case 'J':
11134 case 'K':
11135 case 'L':
11136 case 'M':
11137 case 'N':
11138 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
11139 if (!C)
11140 return;
11141
11142 // Grab the value and do some validation.
11143 uint64_t CVal = C->getZExtValue();
11144 switch (ConstraintLetter) {
11145 // The I constraint applies only to simple ADD or SUB immediate operands:
11146 // i.e. 0 to 4095 with optional shift by 12
11147 // The J constraint applies only to ADD or SUB immediates that would be
11148 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
11149 // instruction [or vice versa], in other words -1 to -4095 with optional
11150 // left shift by 12.
11151 case 'I':
11152 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
11153 break;
11154 return;
11155 case 'J': {
11156 uint64_t NVal = -C->getSExtValue();
11157 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
11158 CVal = C->getSExtValue();
11159 break;
11160 }
11161 return;
11162 }
11163 // The K and L constraints apply *only* to logical immediates, including
11164 // what used to be the MOVI alias for ORR (though the MOVI alias has now
11165 // been removed and MOV should be used). So these constraints have to
11166 // distinguish between bit patterns that are valid 32-bit or 64-bit
11167 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
11168 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
11169 // versa.
11170 case 'K':
11171 if (AArch64_AM::isLogicalImmediate(CVal, 32))
11172 break;
11173 return;
11174 case 'L':
11175 if (AArch64_AM::isLogicalImmediate(CVal, 64))
11176 break;
11177 return;
11178 // The M and N constraints are a superset of K and L respectively, for use
11179 // with the MOV (immediate) alias. As well as the logical immediates they
11180 // also match 32 or 64-bit immediates that can be loaded either using a
11181 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
11182 // (M) or 64-bit 0x1234000000000000 (N) etc.
11183 // As a note some of this code is liberally stolen from the asm parser.
11184 case 'M': {
11185 if (!isUInt<32>(CVal))
11186 return;
11187 if (AArch64_AM::isLogicalImmediate(CVal, 32))
11188 break;
11189 if ((CVal & 0xFFFF) == CVal)
11190 break;
11191 if ((CVal & 0xFFFF0000ULL) == CVal)
11192 break;
11193 uint64_t NCVal = ~(uint32_t)CVal;
11194 if ((NCVal & 0xFFFFULL) == NCVal)
11195 break;
11196 if ((NCVal & 0xFFFF0000ULL) == NCVal)
11197 break;
11198 return;
11199 }
11200 case 'N': {
11201 if (AArch64_AM::isLogicalImmediate(CVal, 64))
11202 break;
11203 if ((CVal & 0xFFFFULL) == CVal)
11204 break;
11205 if ((CVal & 0xFFFF0000ULL) == CVal)
11206 break;
11207 if ((CVal & 0xFFFF00000000ULL) == CVal)
11208 break;
11209 if ((CVal & 0xFFFF000000000000ULL) == CVal)
11210 break;
11211 uint64_t NCVal = ~CVal;
11212 if ((NCVal & 0xFFFFULL) == NCVal)
11213 break;
11214 if ((NCVal & 0xFFFF0000ULL) == NCVal)
11215 break;
11216 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
11217 break;
11218 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
11219 break;
11220 return;
11221 }
11222 default:
11223 return;
11224 }
11225
11226 // All assembler immediates are 64-bit integers.
11227 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
11228 break;
11229 }
11230
11231 if (Result.getNode()) {
11232 Ops.push_back(Result);
11233 return;
11234 }
11235
11236 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
11237}
11238
11239//===----------------------------------------------------------------------===//
11240// AArch64 Advanced SIMD Support
11241//===----------------------------------------------------------------------===//
11242
11243/// WidenVector - Given a value in the V64 register class, produce the
11244/// equivalent value in the V128 register class.
11245 static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
11246 EVT VT = V64Reg.getValueType();
11247 unsigned NarrowSize = VT.getVectorNumElements();
11248 MVT EltTy = VT.getVectorElementType().getSimpleVT();
11249 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
11250 SDLoc DL(V64Reg);
11251
11252 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
11253 V64Reg, DAG.getConstant(0, DL, MVT::i64));
11254}
11255
11256/// getExtFactor - Determine the adjustment factor for the position when
11257/// generating an "extract from vector registers" instruction.
11258static unsigned getExtFactor(SDValue &V) {
11259 EVT EltType = V.getValueType().getVectorElementType();
11260 return EltType.getSizeInBits() / 8;
11261}
11262
11263// Check if a vector is built from one vector via extracted elements of
11264// another together with an AND mask, ensuring that all elements fit
11265// within range. This can be reconstructed using AND and NEON's TBL1.
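// For example, a v8i8 BUILD_VECTOR whose element i is
// (extractelt %src, (and (extractelt %mask, i), 7)) can be emitted as an AND
// of %mask with the collected constants (a splat of 7 here) followed by a
// single TBL1 that uses %src as the table.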
11266 static SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) {
11267 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
11268 SDLoc dl(Op);
11269 EVT VT = Op.getValueType();
11270 assert(!VT.isScalableVector() &&
11271 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
11272
11273 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
11274 // directly to TBL1.
11275 if (VT != MVT::v16i8 && VT != MVT::v8i8)
11276 return SDValue();
11277
11278 unsigned NumElts = VT.getVectorNumElements();
11279 assert((NumElts == 8 || NumElts == 16) &&
11280 "Need to have exactly 8 or 16 elements in vector.");
11281
11282 SDValue SourceVec;
11283 SDValue MaskSourceVec;
11284 SmallVector<SDValue, 16> AndMaskConstants;
11285
11286 for (unsigned i = 0; i < NumElts; ++i) {
11287 SDValue V = Op.getOperand(i);
11288 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11289 return SDValue();
11290
11291 SDValue OperandSourceVec = V.getOperand(0);
11292 if (!SourceVec)
11293 SourceVec = OperandSourceVec;
11294 else if (SourceVec != OperandSourceVec)
11295 return SDValue();
11296
11297 // This only looks at shuffles with elements that are
11298 // a) truncated by a constant AND mask extracted from a mask vector, or
11299 // b) extracted directly from a mask vector.
11300 SDValue MaskSource = V.getOperand(1);
11301 if (MaskSource.getOpcode() == ISD::AND) {
11302 if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
11303 return SDValue();
11304
11305 AndMaskConstants.push_back(MaskSource.getOperand(1));
11306 MaskSource = MaskSource->getOperand(0);
11307 } else if (!AndMaskConstants.empty()) {
11308 // Either all or no operands should have an AND mask.
11309 return SDValue();
11310 }
11311
11312 // An ANY_EXTEND may be inserted between the AND and the source vector
11313 // extraction. We don't care about that, so we can just skip it.
11314 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
11315 MaskSource = MaskSource.getOperand(0);
11316
11317 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11318 return SDValue();
11319
11320 SDValue MaskIdx = MaskSource.getOperand(1);
11321 if (!isa<ConstantSDNode>(MaskIdx) ||
11322 !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
11323 return SDValue();
11324
11325 // We only apply this if all elements come from the same vector with the
11326 // same vector type.
11327 if (!MaskSourceVec) {
11328 MaskSourceVec = MaskSource->getOperand(0);
11329 if (MaskSourceVec.getValueType() != VT)
11330 return SDValue();
11331 } else if (MaskSourceVec != MaskSource->getOperand(0)) {
11332 return SDValue();
11333 }
11334 }
11335
11336 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
11337 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
11338 // insert, we know that the index in the mask must be smaller than the number
11339 // of elements in the source, or we would have an out-of-bounds access.
11340 if (NumElts == 8)
11341 SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, SourceVec,
11342 DAG.getUNDEF(VT));
11343
11344 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
11345 if (!AndMaskConstants.empty())
11346 MaskSourceVec = DAG.getNode(ISD::AND, dl, VT, MaskSourceVec,
11347 DAG.getBuildVector(VT, dl, AndMaskConstants));
11348
11349 return DAG.getNode(
11350 ISD::INTRINSIC_WO_CHAIN, dl, VT,
11351 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, dl, MVT::i32), SourceVec,
11352 MaskSourceVec);
11353}
11354
11355// Gather data to see if the operation can be modelled as a
11356// shuffle in combination with VEXTs.
11356 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
11357 SelectionDAG &DAG) const {
11359 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
11360 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
11361 SDLoc dl(Op);
11362 EVT VT = Op.getValueType();
11363 assert(!VT.isScalableVector() &&
11364 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
11365 unsigned NumElts = VT.getVectorNumElements();
11366
11367 struct ShuffleSourceInfo {
11368 SDValue Vec;
11369 unsigned MinElt;
11370 unsigned MaxElt;
11371
11372 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
11373 // be compatible with the shuffle we intend to construct. As a result
11374 // ShuffleVec will be some sliding window into the original Vec.
11375 SDValue ShuffleVec;
11376
11377 // Code should guarantee that element i in Vec starts at element "WindowBase
11378 // + i * WindowScale in ShuffleVec".
11379 int WindowBase;
11380 int WindowScale;
11381
11382 ShuffleSourceInfo(SDValue Vec)
11383 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
11384 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
11385
11386 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
11387 };
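  // For example, after extracting the high half of a double-width source,
  // WindowBase becomes -NumSrcElts; after bitcasting a v4i32 ShuffleVec to
  // v8i16, WindowScale becomes 2 and WindowBase is scaled accordingly.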
11388
11389 // First gather all vectors used as an immediate source for this BUILD_VECTOR
11390 // node.
11391 SmallVector<ShuffleSourceInfo, 2> Sources;
11392 for (unsigned i = 0; i < NumElts; ++i) {
11393 SDValue V = Op.getOperand(i);
11394 if (V.isUndef())
11395 continue;
11396 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11397 !isa<ConstantSDNode>(V.getOperand(1)) ||
11398 V.getOperand(0).getValueType().isScalableVector()) {
11399 LLVM_DEBUG(
11400 dbgs() << "Reshuffle failed: "
11401 "a shuffle can only come from building a vector from "
11402 "various elements of other fixed-width vectors, provided "
11403 "their indices are constant\n");
11404 return SDValue();
11405 }
11406
11407 // Add this element source to the list if it's not already there.
11408 SDValue SourceVec = V.getOperand(0);
11409 auto Source = find(Sources, SourceVec);
11410 if (Source == Sources.end())
11411 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
11412
11413 // Update the minimum and maximum lane number seen.
11414 unsigned EltNo = V.getConstantOperandVal(1);
11415 Source->MinElt = std::min(Source->MinElt, EltNo);
11416 Source->MaxElt = std::max(Source->MaxElt, EltNo);
11417 }
11418
11419 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
11420 // better than moving to/from gpr registers for larger vectors.
11421 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
11422 // Construct a mask for the tbl. We may need to adjust the index for types
11423 // larger than i8.
11424 SmallVector<unsigned, 16> Mask;
11425 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
11426 for (unsigned I = 0; I < NumElts; ++I) {
11427 SDValue V = Op.getOperand(I);
11428 if (V.isUndef()) {
11429 for (unsigned OF = 0; OF < OutputFactor; OF++)
11430 Mask.push_back(-1);
11431 continue;
11432 }
11433 // Set the Mask lanes adjusted for the size of the input and output
11434 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
11435 // output element, adjusted in their positions per input and output types.
11436 unsigned Lane = V.getConstantOperandVal(1);
11437 for (unsigned S = 0; S < Sources.size(); S++) {
11438 if (V.getOperand(0) == Sources[S].Vec) {
11439 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
11440 unsigned InputBase = 16 * S + Lane * InputSize / 8;
11441 for (unsigned OF = 0; OF < OutputFactor; OF++)
11442 Mask.push_back(InputBase + OF);
11443 break;
11444 }
11445 }
11446 }
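    // For example, for a v8i16 result the mask above uses OutputFactor == 2,
    // so an i16 element taken from lane L of source S contributes the two
    // byte indices 16 * S + 2 * L and 16 * S + 2 * L + 1 to the TBL mask.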
11447
11448 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
11449 // v16i8, and the TBLMask
11450 SmallVector<SDValue, 16> TBLOperands;
11451 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
11452 ? Intrinsic::aarch64_neon_tbl3
11453 : Intrinsic::aarch64_neon_tbl4,
11454 dl, MVT::i32));
11455 for (unsigned i = 0; i < Sources.size(); i++) {
11456 SDValue Src = Sources[i].Vec;
11457 EVT SrcVT = Src.getValueType();
11458 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
11459 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
11460 "Expected a legally typed vector");
11461 if (SrcVT.is64BitVector())
11462 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src,
11463 DAG.getUNDEF(MVT::v8i8));
11464 TBLOperands.push_back(Src);
11465 }
11466
11467 SmallVector<SDValue, 16> TBLMask;
11468 for (unsigned i = 0; i < Mask.size(); i++)
11469 TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32));
11470 assert((Mask.size() == 8 || Mask.size() == 16) &&
11471 "Expected a v8i8 or v16i8 Mask");
11472 TBLOperands.push_back(
11473 DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));
11474
11475 SDValue Shuffle =
11476 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
11477 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
11478 return DAG.getBitcast(VT, Shuffle);
11479 }
11480
11481 if (Sources.size() > 2) {
11482 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
11483 << "sensible when at most two source vectors are "
11484 << "involved\n");
11485 return SDValue();
11486 }
11487
11488 // Find out the smallest element size among result and two sources, and use
11489 // it as element size to build the shuffle_vector.
11490 EVT SmallestEltTy = VT.getVectorElementType();
11491 for (auto &Source : Sources) {
11492 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
11493 if (SrcEltTy.bitsLT(SmallestEltTy)) {
11494 SmallestEltTy = SrcEltTy;
11495 }
11496 }
11497 unsigned ResMultiplier =
11498 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
11499 uint64_t VTSize = VT.getFixedSizeInBits();
11500 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
11501 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
11502
11503 // If the source vector is too wide or too narrow, we may nevertheless be able
11504 // to construct a compatible shuffle either by concatenating it with UNDEF or
11505 // extracting a suitable range of elements.
11506 for (auto &Src : Sources) {
11507 EVT SrcVT = Src.ShuffleVec.getValueType();
11508
11509 TypeSize SrcVTSize = SrcVT.getSizeInBits();
11510 if (SrcVTSize == TypeSize::getFixed(VTSize))
11511 continue;
11512
11513 // This stage of the search produces a source with the same element type as
11514 // the original, but with a total width matching the BUILD_VECTOR output.
11515 EVT EltVT = SrcVT.getVectorElementType();
11516 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
11517 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
11518
11519 if (SrcVTSize.getFixedValue() < VTSize) {
11520 assert(2 * SrcVTSize == VTSize);
11521 // We can pad out the smaller vector for free, so if it's part of a
11522 // shuffle...
11523 Src.ShuffleVec =
11524 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
11525 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
11526 continue;
11527 }
11528
11529 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
11530 LLVM_DEBUG(
11531 dbgs() << "Reshuffle failed: result vector too small to extract\n");
11532 return SDValue();
11533 }
11534
11535 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
11536 LLVM_DEBUG(
11537 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
11538 return SDValue();
11539 }
11540
11541 if (Src.MinElt >= NumSrcElts) {
11542 // The extraction can just take the second half
11543 Src.ShuffleVec =
11544 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11545 DAG.getConstant(NumSrcElts, dl, MVT::i64));
11546 Src.WindowBase = -NumSrcElts;
11547 } else if (Src.MaxElt < NumSrcElts) {
11548 // The extraction can just take the first half
11549 Src.ShuffleVec =
11550 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11551 DAG.getConstant(0, dl, MVT::i64));
11552 } else {
11553 // An actual VEXT is needed
11554 SDValue VEXTSrc1 =
11555 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11556 DAG.getConstant(0, dl, MVT::i64));
11557 SDValue VEXTSrc2 =
11558 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11559 DAG.getConstant(NumSrcElts, dl, MVT::i64));
11560 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
11561
11562 if (!SrcVT.is64BitVector()) {
11563 LLVM_DEBUG(
11564 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
11565 "for SVE vectors.");
11566 return SDValue();
11567 }
11568
11569 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
11570 VEXTSrc2,
11571 DAG.getConstant(Imm, dl, MVT::i32));
11572 Src.WindowBase = -Src.MinElt;
11573 }
11574 }
11575
11576 // Another possible incompatibility occurs from the vector element types. We
11577 // can fix this by bitcasting the source vectors to the same type we intend
11578 // for the shuffle.
11579 for (auto &Src : Sources) {
11580 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
11581 if (SrcEltTy == SmallestEltTy)
11582 continue;
11583 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
11584 if (DAG.getDataLayout().isBigEndian()) {
11585 Src.ShuffleVec =
11586 DAG.getNode(AArch64ISD::NVCAST, dl, ShuffleVT, Src.ShuffleVec);
11587 } else {
11588 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
11589 }
11590 Src.WindowScale =
11591 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
11592 Src.WindowBase *= Src.WindowScale;
11593 }
11594
11595 // Final check before we try to actually produce a shuffle.
11596 LLVM_DEBUG(for (auto Src
11597 : Sources)
11598 assert(Src.ShuffleVec.getValueType() == ShuffleVT););
11599
11600 // The stars all align, our next step is to produce the mask for the shuffle.
11601 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
11602 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
11603 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
11604 SDValue Entry = Op.getOperand(i);
11605 if (Entry.isUndef())
11606 continue;
11607
11608 auto Src = find(Sources, Entry.getOperand(0));
11609 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
11610
11611 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
11612 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
11613 // segment.
11614 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
11615 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
11616 VT.getScalarSizeInBits());
11617 int LanesDefined = BitsDefined / BitsPerShuffleLane;
11618
11619 // This source is expected to fill ResMultiplier lanes of the final shuffle,
11620 // starting at the appropriate offset.
11621 int *LaneMask = &Mask[i * ResMultiplier];
11622
11623 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
11624 ExtractBase += NumElts * (Src - Sources.begin());
11625 for (int j = 0; j < LanesDefined; ++j)
11626 LaneMask[j] = ExtractBase + j;
11627 }
11628
11629 // Final check before we try to produce nonsense...
11630 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
11631 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
11632 return SDValue();
11633 }
11634
11635 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
11636 for (unsigned i = 0; i < Sources.size(); ++i)
11637 ShuffleOps[i] = Sources[i].ShuffleVec;
11638
11639 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
11640 ShuffleOps[1], Mask);
11641 SDValue V;
11642 if (DAG.getDataLayout().isBigEndian()) {
11643 V = DAG.getNode(AArch64ISD::NVCAST, dl, VT, Shuffle);
11644 } else {
11645 V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
11646 }
11647
11648 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
11649 dbgs() << "Reshuffle, creating node: "; V.dump(););
11650
11651 return V;
11652}
11653
11654// check if an EXT instruction can handle the shuffle mask when the
11655// vector sources of the shuffle are the same.
11656static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
11657 unsigned NumElts = VT.getVectorNumElements();
11658
11659 // Assume that the first shuffle index is not UNDEF. Fail if it is.
11660 if (M[0] < 0)
11661 return false;
11662
11663 Imm = M[0];
11664
11665 // If this is a VEXT shuffle, the immediate value is the index of the first
11666 // element. The other shuffle indices must be the successive elements after
11667 // the first one.
11668 unsigned ExpectedElt = Imm;
11669 for (unsigned i = 1; i < NumElts; ++i) {
11670 // Increment the expected index. If it wraps around, just follow it
11671 // back to index zero and keep going.
11672 ++ExpectedElt;
11673 if (ExpectedElt == NumElts)
11674 ExpectedElt = 0;
11675
11676 if (M[i] < 0)
11677 continue; // ignore UNDEF indices
11678 if (ExpectedElt != static_cast<unsigned>(M[i]))
11679 return false;
11680 }
11681
11682 return true;
11683}
11684
11685// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
11686// v4i32s. This is really a truncate, which we can construct out of (legal)
11687// concats and truncate nodes.
11688 static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
11689 if (V.getValueType() != MVT::v16i8)
11690 return SDValue();
11691 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
11692
11693 for (unsigned X = 0; X < 4; X++) {
11694 // Check the first item in each group is an extract from lane 0 of a v4i32
11695 // or v4i16.
11696 SDValue BaseExt = V.getOperand(X * 4);
11697 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11698 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
11699 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
11700 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
11701 BaseExt.getConstantOperandVal(1) != 0)
11702 return SDValue();
11703 SDValue Base = BaseExt.getOperand(0);
11704 // And check the other items are extracts from the same vector.
11705 for (unsigned Y = 1; Y < 4; Y++) {
11706 SDValue Ext = V.getOperand(X * 4 + Y);
11707 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11708 Ext.getOperand(0) != Base ||
11709 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
11710 Ext.getConstantOperandVal(1) != Y)
11711 return SDValue();
11712 }
11713 }
11714
11715 // Turn the buildvector into a series of truncates and concats, which will
11716 // become uzip1s. Any v4i32s we found get truncated to v4i16, which are
11717 // concatenated together to produce 2 v8i16s. These are both truncated and
11718 // concatenated together.
11719 SDLoc DL(V);
11720 SDValue Trunc[4] = {
11721 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
11722 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
11723 for (SDValue &V : Trunc)
11724 if (V.getValueType() == MVT::v4i32)
11725 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
11726 SDValue Concat0 =
11727 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
11728 SDValue Concat1 =
11729 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
11730 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
11731 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
11732 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
11733}
11734
11735/// Check if a vector shuffle corresponds to a DUP instructions with a larger
11736/// element width than the vector lane type. If that is the case the function
11737/// returns true and writes the value of the DUP instruction lane operand into
11738/// DupLaneOp
11739static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
11740 unsigned &DupLaneOp) {
11741 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
11742 "Only possible block sizes for wide DUP are: 16, 32, 64");
11743
11744 if (BlockSize <= VT.getScalarSizeInBits())
11745 return false;
11746 if (BlockSize % VT.getScalarSizeInBits() != 0)
11747 return false;
11748 if (VT.getSizeInBits() % BlockSize != 0)
11749 return false;
11750
11751 size_t SingleVecNumElements = VT.getVectorNumElements();
11752 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
11753 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
11754
11755 // We are looking for masks like
11756 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
11757 // might be replaced by 'undefined'. BlockElts will eventually contain
11758 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
11759 // for the above examples)
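  // For example, the v8i16 mask <2, 3, 2, 3, 2, 3, 2, 3> with BlockSize 32
  // gives NumEltsPerBlock == 2 and BlockElts == [2, 3], so DupLaneOp == 1,
  // i.e. a DUP of 32-bit lane 1.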
11760 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
11761 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
11762 for (size_t I = 0; I < NumEltsPerBlock; I++) {
11763 int Elt = M[BlockIndex * NumEltsPerBlock + I];
11764 if (Elt < 0)
11765 continue;
11766 // For now we don't support shuffles that use the second operand
11767 if ((unsigned)Elt >= SingleVecNumElements)
11768 return false;
11769 if (BlockElts[I] < 0)
11770 BlockElts[I] = Elt;
11771 else if (BlockElts[I] != Elt)
11772 return false;
11773 }
11774
11775 // We found a candidate block (possibly with some undefs). It must be a
11776 // sequence of consecutive integers starting with a value divisible by
11777 // NumEltsPerBlock with some values possibly replaced by undef-s.
11778
11779 // Find first non-undef element
11780 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
11781 assert(FirstRealEltIter != BlockElts.end() &&
11782 "Shuffle with all-undefs must have been caught by previous cases, "
11783 "e.g. isSplat()");
11784 if (FirstRealEltIter == BlockElts.end()) {
11785 DupLaneOp = 0;
11786 return true;
11787 }
11788
11789 // Index of FirstRealElt in BlockElts
11790 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
11791
11792 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
11793 return false;
11794 // BlockElts[0] must have the following value if it isn't undef:
11795 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
11796
11797 // Check the first element
11798 if (Elt0 % NumEltsPerBlock != 0)
11799 return false;
11800 // Check that the sequence indeed consists of consecutive integers (modulo
11801 // undefs)
11802 for (size_t I = 0; I < NumEltsPerBlock; I++)
11803 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
11804 return false;
11805
11806 DupLaneOp = Elt0 / NumEltsPerBlock;
11807 return true;
11808}
11809
11810// check if an EXT instruction can handle the shuffle mask when the
11811// vector sources of the shuffle are different.
11812static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
11813 unsigned &Imm) {
11814 // Look for the first non-undef element.
11815 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
11816
11817 // Benefit from APInt to handle overflow when calculating the expected element.
11818 unsigned NumElts = VT.getVectorNumElements();
11819 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
11820 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
11821 // The following shuffle indices must be the successive elements after the
11822 // first real element.
11823 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
11824 return Elt != ExpectedElt++ && Elt != -1;
11825 });
11826 if (FoundWrongElt)
11827 return false;
11828
11829 // The index of an EXT is the first element if it is not UNDEF.
11830 // Watch out for the beginning UNDEFs. The EXT index should be the expected
11831 // value of the first element. E.g.
11832 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
11833 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
11834 // ExpectedElt is the last mask index plus 1.
11835 Imm = ExpectedElt.getZExtValue();
11836
11837 // There are two different cases that require reversing the input vectors.
11838 // For example, for vector <4 x i32> we have the following cases,
11839 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
11840 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
11841 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
11842 // to reverse two input vectors.
11843 if (Imm < NumElts)
11844 ReverseEXT = true;
11845 else
11846 Imm -= NumElts;
11847
11848 return true;
11849}
11850
11851/// isREVMask - Check if a vector shuffle corresponds to a REV
11852/// instruction with the specified blocksize. (The order of the elements
11853/// within each block of the vector is reversed.)
11854static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
11855 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64 ||
11856 BlockSize == 128) &&
11857 "Only possible block sizes for REV are: 16, 32, 64, 128");
11858
11859 unsigned EltSz = VT.getScalarSizeInBits();
11860 unsigned NumElts = VT.getVectorNumElements();
11861 unsigned BlockElts = M[0] + 1;
11862 // If the first shuffle index is UNDEF, be optimistic.
11863 if (M[0] < 0)
11864 BlockElts = BlockSize / EltSz;
11865
11866 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
11867 return false;
11868
11869 for (unsigned i = 0; i < NumElts; ++i) {
11870 if (M[i] < 0)
11871 continue; // ignore UNDEF indices
11872 if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
11873 return false;
11874 }
11875
11876 return true;
11877}
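// For example, for v8i8 and BlockSize 32 the mask <3, 2, 1, 0, 7, 6, 5, 4>
// is accepted, corresponding to a REV32 of the 8-bit lanes.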
11878
11879static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11880 unsigned NumElts = VT.getVectorNumElements();
11881 if (NumElts % 2 != 0)
11882 return false;
11883 WhichResult = (M[0] == 0 ? 0 : 1);
11884 unsigned Idx = WhichResult * NumElts / 2;
11885 for (unsigned i = 0; i != NumElts; i += 2) {
11886 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
11887 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
11888 return false;
11889 Idx += 1;
11890 }
11891
11892 return true;
11893}
11894
11895static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11896 unsigned NumElts = VT.getVectorNumElements();
11897 WhichResult = (M[0] == 0 ? 0 : 1);
11898 for (unsigned i = 0; i != NumElts; ++i) {
11899 if (M[i] < 0)
11900 continue; // ignore UNDEF indices
11901 if ((unsigned)M[i] != 2 * i + WhichResult)
11902 return false;
11903 }
11904
11905 return true;
11906}
11907
11908static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11909 unsigned NumElts = VT.getVectorNumElements();
11910 if (NumElts % 2 != 0)
11911 return false;
11912 WhichResult = (M[0] == 0 ? 0 : 1);
11913 for (unsigned i = 0; i < NumElts; i += 2) {
11914 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
11915 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
11916 return false;
11917 }
11918 return true;
11919}
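// For example, on v4i32 the first results are matched by
//   ZIP1: <0, 4, 1, 5>   UZP1: <0, 2, 4, 6>   TRN1: <0, 4, 2, 6>
// and WhichResult == 1 selects the corresponding ZIP2/UZP2/TRN2 masks.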
11920
11921/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
11922/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11923/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
11924static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11925 unsigned NumElts = VT.getVectorNumElements();
11926 if (NumElts % 2 != 0)
11927 return false;
11928 WhichResult = (M[0] == 0 ? 0 : 1);
11929 unsigned Idx = WhichResult * NumElts / 2;
11930 for (unsigned i = 0; i != NumElts; i += 2) {
11931 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
11932 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
11933 return false;
11934 Idx += 1;
11935 }
11936
11937 return true;
11938}
11939
11940/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
11941/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11942/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
11943static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11944 unsigned Half = VT.getVectorNumElements() / 2;
11945 WhichResult = (M[0] == 0 ? 0 : 1);
11946 for (unsigned j = 0; j != 2; ++j) {
11947 unsigned Idx = WhichResult;
11948 for (unsigned i = 0; i != Half; ++i) {
11949 int MIdx = M[i + j * Half];
11950 if (MIdx >= 0 && (unsigned)MIdx != Idx)
11951 return false;
11952 Idx += 2;
11953 }
11954 }
11955
11956 return true;
11957}
11958
11959/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
11960/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11961/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
11962static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11963 unsigned NumElts = VT.getVectorNumElements();
11964 if (NumElts % 2 != 0)
11965 return false;
11966 WhichResult = (M[0] == 0 ? 0 : 1);
11967 for (unsigned i = 0; i < NumElts; i += 2) {
11968 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
11969 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
11970 return false;
11971 }
11972 return true;
11973}
11974
11975static bool isINSMask(ArrayRef<int> M, int NumInputElements,
11976 bool &DstIsLeft, int &Anomaly) {
11977 if (M.size() != static_cast<size_t>(NumInputElements))
11978 return false;
11979
11980 int NumLHSMatch = 0, NumRHSMatch = 0;
11981 int LastLHSMismatch = -1, LastRHSMismatch = -1;
11982
11983 for (int i = 0; i < NumInputElements; ++i) {
11984 if (M[i] == -1) {
11985 ++NumLHSMatch;
11986 ++NumRHSMatch;
11987 continue;
11988 }
11989
11990 if (M[i] == i)
11991 ++NumLHSMatch;
11992 else
11993 LastLHSMismatch = i;
11994
11995 if (M[i] == i + NumInputElements)
11996 ++NumRHSMatch;
11997 else
11998 LastRHSMismatch = i;
11999 }
12000
12001 if (NumLHSMatch == NumInputElements - 1) {
12002 DstIsLeft = true;
12003 Anomaly = LastLHSMismatch;
12004 return true;
12005 } else if (NumRHSMatch == NumInputElements - 1) {
12006 DstIsLeft = false;
12007 Anomaly = LastRHSMismatch;
12008 return true;
12009 }
12010
12011 return false;
12012}
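// For example, the v4i32 mask <0, 1, 6, 3> matches the LHS identity in all
// lanes but lane 2, so DstIsLeft is true and Anomaly == 2: lane 2 of the
// result is inserted from the other operand.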
12013
12014static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
12015 if (VT.getSizeInBits() != 128)
12016 return false;
12017
12018 unsigned NumElts = VT.getVectorNumElements();
12019
12020 for (int I = 0, E = NumElts / 2; I != E; I++) {
12021 if (Mask[I] != I)
12022 return false;
12023 }
12024
12025 int Offset = NumElts / 2;
12026 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
12027 if (Mask[I] != I + SplitLHS * Offset)
12028 return false;
12029 }
12030
12031 return true;
12032}
12033
12034 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
12035 SDLoc DL(Op);
12036 EVT VT = Op.getValueType();
12037 SDValue V0 = Op.getOperand(0);
12038 SDValue V1 = Op.getOperand(1);
12039 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
12040
12041 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
12042 VT.getVectorElementType() != V1.getValueType().getVectorElementType())
12043 return SDValue();
12044
12045 bool SplitV0 = V0.getValueSizeInBits() == 128;
12046
12047 if (!isConcatMask(Mask, VT, SplitV0))
12048 return SDValue();
12049
12050 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
12051 if (SplitV0) {
12052 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
12053 DAG.getConstant(0, DL, MVT::i64));
12054 }
12055 if (V1.getValueSizeInBits() == 128) {
12056 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
12057 DAG.getConstant(0, DL, MVT::i64));
12058 }
12059 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
12060}
12061
12062/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
12063/// the specified operations to build the shuffle. ID is the perfect-shuffle
12064/// ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect shuffle
12065/// table entry and LHS/RHS are the immediate inputs for this stage of the
12066/// shuffle.
12067static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
12068 SDValue V2, unsigned PFEntry, SDValue LHS,
12069 SDValue RHS, SelectionDAG &DAG,
12070 const SDLoc &dl) {
12071 unsigned OpNum = (PFEntry >> 26) & 0x0F;
12072 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
12073 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
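 // A perfect-shuffle table entry packs an opcode into bits [29:26] and two
 // 13-bit operand IDs; each ID encodes a four-lane mask in base 9, one digit
 // per lane, with digit 8 meaning undef. The identity mask <0, 1, 2, 3>, for
 // instance, encodes as ((0 * 9 + 1) * 9 + 2) * 9 + 3, which is the value
 // the OP_COPY check below tests for.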
12074
12075 enum {
12076 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
12077 OP_VREV,
12078 OP_VDUP0,
12079 OP_VDUP1,
12080 OP_VDUP2,
12081 OP_VDUP3,
12082 OP_VEXT1,
12083 OP_VEXT2,
12084 OP_VEXT3,
12085 OP_VUZPL, // VUZP, left result
12086 OP_VUZPR, // VUZP, right result
12087 OP_VZIPL, // VZIP, left result
12088 OP_VZIPR, // VZIP, right result
12089 OP_VTRNL, // VTRN, left result
12090 OP_VTRNR, // VTRN, right result
12091 OP_MOVLANE // Move lane. RHSID is the lane to move into
12092 };
12093
12094 if (OpNum == OP_COPY) {
12095 if (LHSID == (1 * 9 + 2) * 9 + 3)
12096 return LHS;
12097 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
12098 return RHS;
12099 }
12100
12101 if (OpNum == OP_MOVLANE) {
12102 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
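 // getPFIDLane(ID, Elt) recovers the Elt'th base-9 digit of ID, i.e. the
 // source lane selected for result lane Elt, or -1 if that lane is undef
 // (digit 8).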
12103 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
12104 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
12105 Elt = 3 - Elt;
12106 while (Elt > 0) {
12107 ID /= 9;
12108 Elt--;
12109 }
12110 return (ID % 9 == 8) ? -1 : ID % 9;
12111 };
12112
12113 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
12114 // get the lane to move from the PFID, which is always from the
12115 // original vectors (V1 or V2).
12116 SDValue OpLHS = GeneratePerfectShuffle(
12117 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
12118 EVT VT = OpLHS.getValueType();
12119 assert(RHSID < 8 && "Expected a lane index for RHSID!");
12120 unsigned ExtLane = 0;
12121 SDValue Input;
12122
12123 // An OP_MOVLANE is either a D mov (if bit 0x4 is set) or an S mov. For D
12124 // movs the vectors are first bitcast to a type with double-width elements.
12125 if (RHSID & 0x4) {
12126 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
12127 if (MaskElt == -1)
12128 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
12129 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
12130 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
12131 Input = MaskElt < 2 ? V1 : V2;
12132 if (VT.getScalarSizeInBits() == 16) {
12133 Input = DAG.getBitcast(MVT::v2f32, Input);
12134 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
12135 } else {
12136 assert(VT.getScalarSizeInBits() == 32 &&
12137 "Expected 16 or 32 bit shuffle elemements");
12138 Input = DAG.getBitcast(MVT::v2f64, Input);
12139 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
12140 }
12141 } else {
12142 int MaskElt = getPFIDLane(ID, RHSID);
12143 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
12144 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
12145 Input = MaskElt < 4 ? V1 : V2;
12146 // Be careful about creating illegal types. Use f16 instead of i16.
12147 if (VT == MVT::v4i16) {
12148 Input = DAG.getBitcast(MVT::v4f16, Input);
12149 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
12150 }
12151 }
12152 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
12153 Input.getValueType().getVectorElementType(),
12154 Input, DAG.getVectorIdxConstant(ExtLane, dl));
12155 SDValue Ins =
12156 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS,
12157 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl));
12158 return DAG.getBitcast(VT, Ins);
12159 }
12160
12161 SDValue OpLHS, OpRHS;
12162 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
12163 RHS, DAG, dl);
12164 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
12165 RHS, DAG, dl);
12166 EVT VT = OpLHS.getValueType();
12167
12168 switch (OpNum) {
12169 default:
12170 llvm_unreachable("Unknown shuffle opcode!");
12171 case OP_VREV:
12172 // VREV divides the vector in half and swaps within the half.
12173 if (VT.getVectorElementType() == MVT::i32 ||
12174 VT.getVectorElementType() == MVT::f32)
12175 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
12176 // vrev <4 x i16> -> REV32
12177 if (VT.getVectorElementType() == MVT::i16 ||
12178 VT.getVectorElementType() == MVT::f16 ||
12179 VT.getVectorElementType() == MVT::bf16)
12180 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
12181 // vrev <4 x i8> -> REV16
12182 assert(VT.getVectorElementType() == MVT::i8);
12183 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
12184 case OP_VDUP0:
12185 case OP_VDUP1:
12186 case OP_VDUP2:
12187 case OP_VDUP3: {
12188 EVT EltTy = VT.getVectorElementType();
12189 unsigned Opcode;
12190 if (EltTy == MVT::i8)
12191 Opcode = AArch64ISD::DUPLANE8;
12192 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
12193 Opcode = AArch64ISD::DUPLANE16;
12194 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
12195 Opcode = AArch64ISD::DUPLANE32;
12196 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
12197 Opcode = AArch64ISD::DUPLANE64;
12198 else
12199 llvm_unreachable("Invalid vector element type?");
12200
12201 if (VT.getSizeInBits() == 64)
12202 OpLHS = WidenVector(OpLHS, DAG);
12203 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
12204 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
12205 }
12206 case OP_VEXT1:
12207 case OP_VEXT2:
12208 case OP_VEXT3: {
12209 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
12210 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
12211 DAG.getConstant(Imm, dl, MVT::i32));
12212 }
12213 case OP_VUZPL:
12214 return DAG.getNode(AArch64ISD::UZP1, dl, VT, OpLHS, OpRHS);
12215 case OP_VUZPR:
12216 return DAG.getNode(AArch64ISD::UZP2, dl, VT, OpLHS, OpRHS);
12217 case OP_VZIPL:
12218 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, OpLHS, OpRHS);
12219 case OP_VZIPR:
12220 return DAG.getNode(AArch64ISD::ZIP2, dl, VT, OpLHS, OpRHS);
12221 case OP_VTRNL:
12222 return DAG.getNode(AArch64ISD::TRN1, dl, VT, OpLHS, OpRHS);
12223 case OP_VTRNR:
12224 return DAG.getNode(AArch64ISD::TRN2, dl, VT, OpLHS, OpRHS);
12225 }
12226}
12227
12228static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
12229 SelectionDAG &DAG) {
12230 // Check to see if we can use the TBL instruction.
12231 SDValue V1 = Op.getOperand(0);
12232 SDValue V2 = Op.getOperand(1);
12233 SDLoc DL(Op);
12234
12235 EVT EltVT = Op.getValueType().getVectorElementType();
12236 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
12237
12238 bool Swap = false;
12239 if (V1.isUndef() || isZerosVector(V1.getNode())) {
12240 std::swap(V1, V2);
12241 Swap = true;
12242 }
12243
12244 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
12245 // out of range values with 0s. We do need to make sure that any out-of-range
12246 // values are really out-of-range for a v16i8 vector.
12247 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
12248 MVT IndexVT = MVT::v8i8;
12249 unsigned IndexLen = 8;
12250 if (Op.getValueSizeInBits() == 128) {
12251 IndexVT = MVT::v16i8;
12252 IndexLen = 16;
12253 }
12254
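 // Build the byte-level TBL mask: every shuffle-mask element expands to
 // BytesPerElt consecutive byte indices, adjusted below if the inputs were
 // swapped or the index is out of range. A v4i16 shuffle with mask
 // <0, 2, 4, 6>, for example, yields the byte indices <0, 1, 4, 5, 8, 9, 12, 13>.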
12255 SmallVector<SDValue, 8> TBLMask;
12256 for (int Val : ShuffleMask) {
12257 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
12258 unsigned Offset = Byte + Val * BytesPerElt;
12259 if (Swap)
12260 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
12261 if (IsUndefOrZero && Offset >= IndexLen)
12262 Offset = 255;
12263 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
12264 }
12265 }
12266
12267 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
12268 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
12269
12270 SDValue Shuffle;
12271 if (IsUndefOrZero) {
12272 if (IndexLen == 8)
12273 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
12274 Shuffle = DAG.getNode(
12275 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12276 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
12277 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12278 } else {
12279 if (IndexLen == 8) {
12280 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
12281 Shuffle = DAG.getNode(
12282 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12283 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
12284 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12285 } else {
12286 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
12287 // cannot currently represent the register constraints on the input
12288 // table registers.
12289 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
12290 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
12291 // IndexLen));
12292 Shuffle = DAG.getNode(
12293 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12294 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
12295 V2Cst,
12296 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12297 }
12298 }
12299 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
12300}
12301
12302static unsigned getDUPLANEOp(EVT EltType) {
12303 if (EltType == MVT::i8)
12304 return AArch64ISD::DUPLANE8;
12305 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
12306 return AArch64ISD::DUPLANE16;
12307 if (EltType == MVT::i32 || EltType == MVT::f32)
12308 return AArch64ISD::DUPLANE32;
12309 if (EltType == MVT::i64 || EltType == MVT::f64)
12310 return AArch64ISD::DUPLANE64;
12311
12312 llvm_unreachable("Invalid vector element type?");
12313}
12314
12315static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
12316 unsigned Opcode, SelectionDAG &DAG) {
12317 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
12318 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
12319 // Match: dup (bitcast (extract_subv X, C)), LaneC
12320 if (BitCast.getOpcode() != ISD::BITCAST ||
12321 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
12322 return false;
12323
12324 // The extract index must align in the destination type. That may not
12325 // happen if the bitcast is from a narrow type to a wide type.
12326 SDValue Extract = BitCast.getOperand(0);
12327 unsigned ExtIdx = Extract.getConstantOperandVal(1);
12328 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
12329 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
12330 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
12331 if (ExtIdxInBits % CastedEltBitWidth != 0)
12332 return false;
12333
12334 // Can't handle cases where vector size is not 128-bit
12335 if (!Extract.getOperand(0).getValueType().is128BitVector())
12336 return false;
12337
12338 // Update the lane value by offsetting with the scaled extract index.
12339 LaneC += ExtIdxInBits / CastedEltBitWidth;
12340
12341 // Determine the casted vector type of the wide vector input.
12342 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
12343 // Examples:
12344 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
12345 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
12346 unsigned SrcVecNumElts =
12347 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
12348 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
12349 SrcVecNumElts);
12350 return true;
12351 };
12352 MVT CastVT;
12353 if (getScaledOffsetDup(V, Lane, CastVT)) {
12354 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
12355 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12356 V.getOperand(0).getValueType().is128BitVector()) {
12357 // The lane is incremented by the index of the extract.
12358 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
12359 Lane += V.getConstantOperandVal(1);
12360 V = V.getOperand(0);
12361 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
12362 // The lane is decremented if we are splatting from the 2nd operand.
12363 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
12364 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
12365 Lane -= Idx * VT.getVectorNumElements() / 2;
12366 V = WidenVector(V.getOperand(Idx), DAG);
12367 } else if (VT.getSizeInBits() == 64) {
12368 // Widen the operand to 128-bit register with undef.
12369 V = WidenVector(V, DAG);
12370 }
12371 return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
12372}
12373
12374 // Return true if we can get a new shuffle mask by checking that every pair
12375 // of adjacent mask values is consecutive and starts at an even index. On
12376 // success, NewMask holds the equivalent mask for elements of twice the width.
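// For example, <2, 3, 0, 1> becomes <1, 0> and <-1, 7, 2, -1> becomes <3, 1>,
// while a pair such as <1, 2> is rejected because it does not start at an
// even index.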
12377static bool isWideTypeMask(ArrayRef<int> M, EVT VT,
12378 SmallVectorImpl<int> &NewMask) {
12379 unsigned NumElts = VT.getVectorNumElements();
12380 if (NumElts % 2 != 0)
12381 return false;
12382
12383 NewMask.clear();
12384 for (unsigned i = 0; i < NumElts; i += 2) {
12385 int M0 = M[i];
12386 int M1 = M[i + 1];
12387
12388 // If both elements are undef, new mask is undef too.
12389 if (M0 == -1 && M1 == -1) {
12390 NewMask.push_back(-1);
12391 continue;
12392 }
12393
12394 if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
12395 NewMask.push_back(M1 / 2);
12396 continue;
12397 }
12398
12399 if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
12400 NewMask.push_back(M0 / 2);
12401 continue;
12402 }
12403
12404 NewMask.clear();
12405 return false;
12406 }
12407
12408 assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
12409 return true;
12410}
12411
12412// Try to widen element type to get a new mask value for a better permutation
12413// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
12414// UZP1/2, TRN1/2, REV, INS, etc.
12415// For example:
12416// shufflevector <4 x i32> %a, <4 x i32> %b,
12417// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
12418// is equivalent to:
12419// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
12420// Finally, we can get:
12421// mov v0.d[0], v1.d[1]
12422static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
12423 SDLoc DL(Op);
12424 EVT VT = Op.getValueType();
12425 EVT ScalarVT = VT.getVectorElementType();
12426 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
12427 SDValue V0 = Op.getOperand(0);
12428 SDValue V1 = Op.getOperand(1);
12429 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
12430
12431 // If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ...
12432 // We need to make sure the wider element type is legal. Thus, ElementSize
12433 // should be not larger than 32 bits, and i1 type should also be excluded.
12434 if (ElementSize > 32 || ElementSize == 1)
12435 return SDValue();
12436
12437 SmallVector<int, 8> NewMask;
12438 if (isWideTypeMask(Mask, VT, NewMask)) {
12439 MVT NewEltVT = VT.isFloatingPoint()
12440 ? MVT::getFloatingPointVT(ElementSize * 2)
12441 : MVT::getIntegerVT(ElementSize * 2);
12442 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
12443 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
12444 V0 = DAG.getBitcast(NewVT, V0);
12445 V1 = DAG.getBitcast(NewVT, V1);
12446 return DAG.getBitcast(VT,
12447 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
12448 }
12449 }
12450
12451 return SDValue();
12452}
12453
12454// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
12455static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
12456 ArrayRef<int> ShuffleMask,
12457 SelectionDAG &DAG) {
12458 SDValue Tbl1 = Op->getOperand(0);
12459 SDValue Tbl2 = Op->getOperand(1);
12460 SDLoc dl(Op);
12461 SDValue Tbl2ID =
12462 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64);
12463
12464 EVT VT = Op.getValueType();
12465 if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
12466 Tbl1->getOperand(0) != Tbl2ID ||
12467 Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
12468 Tbl2->getOperand(0) != Tbl2ID)
12469 return SDValue();
12470
12471 if (Tbl1->getValueType(0) != MVT::v16i8 ||
12472 Tbl2->getValueType(0) != MVT::v16i8)
12473 return SDValue();
12474
12475 SDValue Mask1 = Tbl1->getOperand(3);
12476 SDValue Mask2 = Tbl2->getOperand(3);
12477 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
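 // Merge the two tbl2 masks into one tbl4 mask. Byte indices taken from
 // Tbl2's mask are rebased by 32 because its table registers become the
 // third and fourth registers (bytes 32-63) of the combined four-register
 // table.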
12478 for (unsigned I = 0; I < 16; I++) {
12479 if (ShuffleMask[I] < 16)
12480 TBLMaskParts[I] = Mask1->getOperand(ShuffleMask[I]);
12481 else {
12482 auto *C =
12483 dyn_cast<ConstantSDNode>(Mask2->getOperand(ShuffleMask[I] - 16));
12484 if (!C)
12485 return SDValue();
12486 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32);
12487 }
12488 }
12489
12490 SDValue TBLMask = DAG.getBuildVector(VT, dl, TBLMaskParts);
12491 SDValue ID =
12492 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64);
12493
12494 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
12495 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
12496 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
12497}
12498
12499// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
12500// but we don't have an appropriate instruction,
12501// so custom-lower it as ZIP1-with-zeros.
12502SDValue
12503AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
12504 SelectionDAG &DAG) const {
12505 SDLoc dl(Op);
12506 EVT VT = Op.getValueType();
12507 SDValue SrcOp = Op.getOperand(0);
12508 EVT SrcVT = SrcOp.getValueType();
12509 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
12510 "Unexpected extension factor.");
12511 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
12512 // FIXME: support multi-step zipping?
12513 if (Scale != 2)
12514 return SDValue();
12515 SDValue Zeros = DAG.getConstant(0, dl, SrcVT);
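 // Interleaving the source with zeros gives <s0, 0, s1, 0, ...>, which,
 // reinterpreted at twice the element width (little-endian lane order), is
 // the zero-extension of the low source elements.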
12516 return DAG.getBitcast(VT,
12517 DAG.getNode(AArch64ISD::ZIP1, dl, SrcVT, SrcOp, Zeros));
12518}
12519
12520SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
12521 SelectionDAG &DAG) const {
12522 SDLoc dl(Op);
12523 EVT VT = Op.getValueType();
12524
12525 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
12526
12527 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
12528 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
12529
12530 // Convert shuffles that are directly supported on NEON to target-specific
12531 // DAG nodes, instead of keeping them as shuffles and matching them again
12532 // during code selection. This is more efficient and avoids the possibility
12533 // of inconsistencies between legalization and selection.
12534 ArrayRef<int> ShuffleMask = SVN->getMask();
12535
12536 SDValue V1 = Op.getOperand(0);
12537 SDValue V2 = Op.getOperand(1);
12538
12539 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
12540 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
12541 "Unexpected VECTOR_SHUFFLE mask size!");
12542
12543 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
12544 return Res;
12545
12546 if (SVN->isSplat()) {
12547 int Lane = SVN->getSplatIndex();
12548 // If this is an undef splat, generate it via "just" vdup, if possible.
12549 if (Lane == -1)
12550 Lane = 0;
12551
12552 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
12553 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
12554 V1.getOperand(0));
12555 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
12556 // constant. If so, we can just reference the lane's definition directly.
12557 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
12558 !isa<ConstantSDNode>(V1.getOperand(Lane)))
12559 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
12560
12561 // Otherwise, duplicate from the lane of the input vector.
12562 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
12563 return constructDup(V1, Lane, dl, VT, Opcode, DAG);
12564 }
12565
12566 // Check if the mask matches a DUP for a wider element
12567 for (unsigned LaneSize : {64U, 32U, 16U}) {
12568 unsigned Lane = 0;
12569 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
12570 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
12571 : LaneSize == 32 ? AArch64ISD::DUPLANE32
12572 : AArch64ISD::DUPLANE16;
12573 // Cast V1 to an integer vector with required lane size
12574 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
12575 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
12576 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
12577 V1 = DAG.getBitcast(NewVecTy, V1);
12578 // Construct the DUP instruction
12579 V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
12580 // Cast back to the original type
12581 return DAG.getBitcast(VT, V1);
12582 }
12583 }
12584
12585 if (isREVMask(ShuffleMask, VT, 64))
12586 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
12587 if (isREVMask(ShuffleMask, VT, 32))
12588 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
12589 if (isREVMask(ShuffleMask, VT, 16))
12590 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
12591
12592 if (((VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 16) ||
12593 (VT.getVectorNumElements() == 16 && VT.getScalarSizeInBits() == 8)) &&
12594 ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
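 // REV64 reverses the lanes within each 64-bit half; the EXT by 8 bytes
 // then swaps the two halves, so together they reverse the whole vector.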
12595 SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
12596 return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
12597 DAG.getConstant(8, dl, MVT::i32));
12598 }
12599
12600 bool ReverseEXT = false;
12601 unsigned Imm;
12602 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
12603 if (ReverseEXT)
12604 std::swap(V1, V2);
12605 Imm *= getExtFactor(V1);
12606 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
12607 DAG.getConstant(Imm, dl, MVT::i32));
12608 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
12609 Imm *= getExtFactor(V1);
12610 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
12611 DAG.getConstant(Imm, dl, MVT::i32));
12612 }
12613
12614 unsigned WhichResult;
12615 if (isZIPMask(ShuffleMask, VT, WhichResult)) {
12616 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
12617 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
12618 }
12619 if (isUZPMask(ShuffleMask, VT, WhichResult)) {
12620 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
12621 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
12622 }
12623 if (isTRNMask(ShuffleMask, VT, WhichResult)) {
12624 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
12625 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
12626 }
12627
12628 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
12629 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
12630 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
12631 }
12632 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
12633 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
12634 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
12635 }
12636 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
12637 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
12638 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
12639 }
12640
12641 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
12642 return Concat;
12643
12644 bool DstIsLeft;
12645 int Anomaly;
12646 int NumInputElements = V1.getValueType().getVectorNumElements();
12647 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
12648 SDValue DstVec = DstIsLeft ? V1 : V2;
12649 SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
12650
12651 SDValue SrcVec = V1;
12652 int SrcLane = ShuffleMask[Anomaly];
12653 if (SrcLane >= NumInputElements) {
12654 SrcVec = V2;
12655 SrcLane -= VT.getVectorNumElements();
12656 }
12657 SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
12658
12659 EVT ScalarVT = VT.getVectorElementType();
12660
12661 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
12662 ScalarVT = MVT::i32;
12663
12664 return DAG.getNode(
12665 ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
12666 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
12667 DstLaneV);
12668 }
12669
12670 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
12671 return NewSD;
12672
12673 // If the shuffle is not directly supported and it has 4 elements, use
12674 // the PerfectShuffle-generated table to synthesize it from other shuffles.
12675 unsigned NumElts = VT.getVectorNumElements();
12676 if (NumElts == 4) {
12677 unsigned PFIndexes[4];
12678 for (unsigned i = 0; i != 4; ++i) {
12679 if (ShuffleMask[i] < 0)
12680 PFIndexes[i] = 8;
12681 else
12682 PFIndexes[i] = ShuffleMask[i];
12683 }
12684
12685 // Compute the index in the perfect shuffle table.
12686 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
12687 PFIndexes[2] * 9 + PFIndexes[3];
12688 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
12689 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
12690 dl);
12691 }
12692
12693 return GenerateTBL(Op, ShuffleMask, DAG);
12694}
12695
12696SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
12697 SelectionDAG &DAG) const {
12698 EVT VT = Op.getValueType();
12699
12700 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
12701 return LowerToScalableOp(Op, DAG);
12702
12703 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
12704 "Unexpected vector type!");
12705
12706 // We can handle the constant cases during isel.
12707 if (isa<ConstantSDNode>(Op.getOperand(0)))
12708 return Op;
12709
12710 // There isn't a natural way to handle the general i1 case, so we use some
12711 // trickery with whilelo.
12712 SDLoc DL(Op);
12713 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
12714 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
12715 DAG.getValueType(MVT::i1));
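 // After the sign-extension SplatVal is either 0 or all-ones, so
 // whilelo(0, SplatVal) yields an all-false or an all-true predicate
 // respectively, which is the required i1 splat.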
12716 SDValue ID =
12717 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
12718 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
12719 if (VT == MVT::nxv1i1)
12720 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
12721 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
12722 Zero, SplatVal),
12723 Zero);
12724 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
12725}
12726
12727SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
12728 SelectionDAG &DAG) const {
12729 SDLoc DL(Op);
12730
12731 EVT VT = Op.getValueType();
12732 if (!isTypeLegal(VT) || !VT.isScalableVector())
12733 return SDValue();
12734
12735 // Current lowering only supports the SVE-ACLE types.
12736 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
12737 return SDValue();
12738
12739 // The DUPQ operation is independent of element type so normalise to i64s.
12740 SDValue Idx128 = Op.getOperand(2);
12741
12742 // DUPQ can be used when idx is in range.
12743 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
12744 if (CIdx && (CIdx->getZExtValue() <= 3)) {
12745 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
12746 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
12747 }
12748
12749 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
12750
12751 // The ACLE says this must produce the same result as:
12752 // svtbl(data, svadd_x(svptrue_b64(),
12753 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
12754 // index * 2))
12755 SDValue One = DAG.getConstant(1, DL, MVT::i64);
12756 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
12757
12758 // create the vector 0,1,0,1,...
12759 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
12760 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
12761
12762 // create the vector idx64,idx64+1,idx64,idx64+1,...
12763 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
12764 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
12765 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
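 // For example, with Idx128 == 1 this mask is <2, 3, 2, 3, ...>, which
 // broadcasts the second 128-bit quadword of the source to every quadword.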
12766
12767 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
12768 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
12769 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
12770}
12771
12772
12773static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
12774 APInt &UndefBits) {
12775 EVT VT = BVN->getValueType(0);
12776 APInt SplatBits, SplatUndef;
12777 unsigned SplatBitSize;
12778 bool HasAnyUndefs;
12779 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
12780 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
12781
12782 for (unsigned i = 0; i < NumSplats; ++i) {
12783 CnstBits <<= SplatBitSize;
12784 UndefBits <<= SplatBitSize;
12785 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
12786 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
12787 }
12788
12789 return true;
12790 }
12791
12792 return false;
12793}
12794
12795// Try 64-bit splatted SIMD immediate.
12796static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12797 const APInt &Bits) {
12798 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12799 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12800 EVT VT = Op.getValueType();
12801 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
12802
12803 if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
12804 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
12805
12806 SDLoc dl(Op);
12807 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12808 DAG.getConstant(Value, dl, MVT::i32));
12809 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12810 }
12811 }
12812
12813 return SDValue();
12814}
12815
12816// Try 32-bit splatted SIMD immediate.
12817static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12818 const APInt &Bits,
12819 const SDValue *LHS = nullptr) {
12820 EVT VT = Op.getValueType();
12821 if (VT.isFixedLengthVector() &&
12822 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
12823 return SDValue();
12824
12825 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12826 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12827 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
12828 bool isAdvSIMDModImm = false;
12829 uint64_t Shift;
12830
12831 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
12832 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
12833 Shift = 0;
12834 }
12835 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
12836 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
12837 Shift = 8;
12838 }
12839 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
12840 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
12841 Shift = 16;
12842 }
12843 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
12844 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
12845 Shift = 24;
12846 }
12847
12848 if (isAdvSIMDModImm) {
12849 SDLoc dl(Op);
12850 SDValue Mov;
12851
12852 if (LHS)
12853 Mov = DAG.getNode(NewOp, dl, MovTy,
12854 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
12855 DAG.getConstant(Value, dl, MVT::i32),
12856 DAG.getConstant(Shift, dl, MVT::i32));
12857 else
12858 Mov = DAG.getNode(NewOp, dl, MovTy,
12859 DAG.getConstant(Value, dl, MVT::i32),
12860 DAG.getConstant(Shift, dl, MVT::i32));
12861
12862 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12863 }
12864 }
12865
12866 return SDValue();
12867}
12868
12869// Try 16-bit splatted SIMD immediate.
12870static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12871 const APInt &Bits,
12872 const SDValue *LHS = nullptr) {
12873 EVT VT = Op.getValueType();
12874 if (VT.isFixedLengthVector() &&
12875 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
12876 return SDValue();
12877
12878 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12879 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12880 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
12881 bool isAdvSIMDModImm = false;
12882 uint64_t Shift;
12883
12884 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
12885 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
12886 Shift = 0;
12887 }
12888 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
12889 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
12890 Shift = 8;
12891 }
12892
12893 if (isAdvSIMDModImm) {
12894 SDLoc dl(Op);
12895 SDValue Mov;
12896
12897 if (LHS)
12898 Mov = DAG.getNode(NewOp, dl, MovTy,
12899 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
12900 DAG.getConstant(Value, dl, MVT::i32),
12901 DAG.getConstant(Shift, dl, MVT::i32));
12902 else
12903 Mov = DAG.getNode(NewOp, dl, MovTy,
12904 DAG.getConstant(Value, dl, MVT::i32),
12905 DAG.getConstant(Shift, dl, MVT::i32));
12906
12907 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12908 }
12909 }
12910
12911 return SDValue();
12912}
12913
12914// Try 32-bit splatted SIMD immediate with shifted ones.
12916 SelectionDAG &DAG, const APInt &Bits) {
12917 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12918 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12919 EVT VT = Op.getValueType();
12920 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
12921 bool isAdvSIMDModImm = false;
12922 uint64_t Shift;
12923
12924 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
12925 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
12926 Shift = 264;
12927 }
12928 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
12929 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
12930 Shift = 272;
12931 }
12932
12933 if (isAdvSIMDModImm) {
12934 SDLoc dl(Op);
12935 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12936 DAG.getConstant(Value, dl, MVT::i32),
12937 DAG.getConstant(Shift, dl, MVT::i32));
12938 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12939 }
12940 }
12941
12942 return SDValue();
12943}
12944
12945// Try 8-bit splatted SIMD immediate.
12946static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12947 const APInt &Bits) {
12948 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12949 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12950 EVT VT = Op.getValueType();
12951 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
12952
12953 if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
12954 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
12955
12956 SDLoc dl(Op);
12957 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12958 DAG.getConstant(Value, dl, MVT::i32));
12959 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12960 }
12961 }
12962
12963 return SDValue();
12964}
12965
12966// Try FP splatted SIMD immediate.
12967static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12968 const APInt &Bits) {
12969 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12970 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12971 EVT VT = Op.getValueType();
12972 bool isWide = (VT.getSizeInBits() == 128);
12973 MVT MovTy;
12974 bool isAdvSIMDModImm = false;
12975
12976 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
12977 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
12978 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
12979 }
12980 else if (isWide &&
12981 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
12982 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
12983 MovTy = MVT::v2f64;
12984 }
12985
12986 if (isAdvSIMDModImm) {
12987 SDLoc dl(Op);
12988 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12989 DAG.getConstant(Value, dl, MVT::i32));
12990 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12991 }
12992 }
12993
12994 return SDValue();
12995}
12996
12997// Specialized code to quickly find if PotentialBVec is a BuildVector that
12998 // consists of only the same constant int value, which is returned in the
12999 // reference argument ConstVal.
13000static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
13001 uint64_t &ConstVal) {
13002 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
13003 if (!Bvec)
13004 return false;
13005 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
13006 if (!FirstElt)
13007 return false;
13008 EVT VT = Bvec->getValueType(0);
13009 unsigned NumElts = VT.getVectorNumElements();
13010 for (unsigned i = 1; i < NumElts; ++i)
13011 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
13012 return false;
13013 ConstVal = FirstElt->getZExtValue();
13014 return true;
13015}
13016
13017static bool isAllInactivePredicate(SDValue N) {
13018 // Look through cast.
13019 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
13020 N = N.getOperand(0);
13021
13022 return ISD::isConstantSplatVectorAllZeros(N.getNode());
13023}
13024
13025static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
13026 unsigned NumElts = N.getValueType().getVectorMinNumElements();
13027
13028 // Look through cast.
13029 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
13030 N = N.getOperand(0);
13031 // When reinterpreting from a type with fewer elements the "new" elements
13032 // are not active, so bail if they're likely to be used.
13033 if (N.getValueType().getVectorMinNumElements() < NumElts)
13034 return false;
13035 }
13036
13037 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
13038 return true;
13039
13040 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
13041 // or smaller than the implicit element type represented by N.
13042 // NOTE: A larger element count implies a smaller element type.
13043 if (N.getOpcode() == AArch64ISD::PTRUE &&
13044 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
13045 return N.getValueType().getVectorMinNumElements() >= NumElts;
13046
13047 // If we're compiling for a specific vector-length, we can check if the
13048 // pattern's VL equals that of the scalable vector at runtime.
13049 if (N.getOpcode() == AArch64ISD::PTRUE) {
13050 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
13051 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
13052 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
13053 if (MaxSVESize && MinSVESize == MaxSVESize) {
13054 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
13055 unsigned PatNumElts =
13056 getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
13057 return PatNumElts == (NumElts * VScale);
13058 }
13059 }
13060
13061 return false;
13062}
13063
13064 // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
13065 // i.e. (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
13066 // BUILD_VECTOR with constant element C1, C2 is a constant, and:
13067// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
13068// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
13069// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
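// For example, with v4i32 elements and C2 == 8 the SLI form requires
// C1 == 0x000000ff, so (or (and X, 0xff), (shl Y, 8)) becomes SLI X, Y, #8,
// which keeps the low 8 bits of X and inserts Y shifted left by 8.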
13070static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
13071 EVT VT = N->getValueType(0);
13072
13073 if (!VT.isVector())
13074 return SDValue();
13075
13076 SDLoc DL(N);
13077
13078 SDValue And;
13079 SDValue Shift;
13080
13081 SDValue FirstOp = N->getOperand(0);
13082 unsigned FirstOpc = FirstOp.getOpcode();
13083 SDValue SecondOp = N->getOperand(1);
13084 unsigned SecondOpc = SecondOp.getOpcode();
13085
13086 // Is one of the operands an AND or a BICi? The AND may have been optimised to
13087 // a BICi in order to use an immediate instead of a register.
13088 // Is the other operand a shl or lshr? This will have been turned into:
13089 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
13090 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
13091 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
13092 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
13093 SecondOpc == AArch64ISD::SHL_PRED ||
13094 SecondOpc == AArch64ISD::SRL_PRED)) {
13095 And = FirstOp;
13096 Shift = SecondOp;
13097
13098 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
13099 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
13100 FirstOpc == AArch64ISD::SHL_PRED ||
13101 FirstOpc == AArch64ISD::SRL_PRED)) {
13102 And = SecondOp;
13103 Shift = FirstOp;
13104 } else
13105 return SDValue();
13106
13107 bool IsAnd = And.getOpcode() == ISD::AND;
13108 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
13109 Shift.getOpcode() == AArch64ISD::SRL_PRED;
13110 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
13111 Shift.getOpcode() == AArch64ISD::SRL_PRED;
13112
13113 // Is the shift amount constant and are all lanes active?
13114 uint64_t C2;
13115 if (ShiftHasPredOp) {
13116 if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
13117 return SDValue();
13118 APInt C;
13119 if (!ISD::isConstantSplatVector(Shift.getOperand(2).getNode(), C))
13120 return SDValue();
13121 C2 = C.getZExtValue();
13122 } else if (ConstantSDNode *C2node =
13123 dyn_cast<ConstantSDNode>(Shift.getOperand(1)))
13124 C2 = C2node->getZExtValue();
13125 else
13126 return SDValue();
13127
13128 APInt C1AsAPInt;
13129 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
13130 if (IsAnd) {
13131 // Is the and mask vector all constant?
13132 if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
13133 return SDValue();
13134 } else {
13135 // Reconstruct the corresponding AND immediate from the two BICi immediates.
13136 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
13137 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
13138 assert(C1nodeImm && C1nodeShift);
13139 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
13140 C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
13141 }
13142
13143 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
13144 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
13145 // how much one can shift elements of a particular size?
13146 if (C2 > ElemSizeInBits)
13147 return SDValue();
13148
13149 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
13150 : APInt::getLowBitsSet(ElemSizeInBits, C2);
13151 if (C1AsAPInt != RequiredC1)
13152 return SDValue();
13153
13154 SDValue X = And.getOperand(0);
13155 SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
13156 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
13157 : Shift.getOperand(1);
13158
13159 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
13160 SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Imm);
13161
13162 LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
13163 LLVM_DEBUG(N->dump(&DAG));
13164 LLVM_DEBUG(dbgs() << "into: \n");
13165 LLVM_DEBUG(ResultSLI->dump(&DAG));
13166
13167 ++NumShiftInserts;
13168 return ResultSLI;
13169}
13170
13171SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
13172 SelectionDAG &DAG) const {
13173 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13174 !Subtarget->isNeonAvailable()))
13175 return LowerToScalableOp(Op, DAG);
13176
13177 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
13178 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
13179 return Res;
13180
13181 EVT VT = Op.getValueType();
13182 if (VT.isScalableVector())
13183 return Op;
13184
13185 SDValue LHS = Op.getOperand(0);
13186 BuildVectorSDNode *BVN =
13187 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
13188 if (!BVN) {
13189 // OR commutes, so try swapping the operands.
13190 LHS = Op.getOperand(1);
13191 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
13192 }
13193 if (!BVN)
13194 return Op;
13195
13196 APInt DefBits(VT.getSizeInBits(), 0);
13197 APInt UndefBits(VT.getSizeInBits(), 0);
13198 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
13199 SDValue NewOp;
13200
13201 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
13202 DefBits, &LHS)) ||
13203 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
13204 DefBits, &LHS)))
13205 return NewOp;
13206
13207 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
13208 UndefBits, &LHS)) ||
13209 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
13210 UndefBits, &LHS)))
13211 return NewOp;
13212 }
13213
13214 // We can always fall back to a non-immediate OR.
13215 return Op;
13216}
13217
13218// Normalize the operands of BUILD_VECTOR. The value of constant operands will
13219// be truncated to fit element width.
13220static SDValue NormalizeBuildVector(SDValue Op,
13221 SelectionDAG &DAG) {
13222 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13223 SDLoc dl(Op);
13224 EVT VT = Op.getValueType();
13225 EVT EltTy = VT.getVectorElementType();
13226
13227 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
13228 return Op;
13229
13230 SmallVector<SDValue, 16> Ops;
13231 for (SDValue Lane : Op->ops()) {
13232 // For integer vectors, type legalization would have promoted the
13233 // operands already. Otherwise, if Op is a floating-point splat
13234 // (with operands cast to integers), then the only possibilities
13235 // are constants and UNDEFs.
13236 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
13237 APInt LowBits(EltTy.getSizeInBits(),
13238 CstLane->getZExtValue());
13239 Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
13240 } else if (Lane.getNode()->isUndef()) {
13241 Lane = DAG.getUNDEF(MVT::i32);
13242 } else {
13243 assert(Lane.getValueType() == MVT::i32 &&
13244 "Unexpected BUILD_VECTOR operand type");
13245 }
13246 Ops.push_back(Lane);
13247 }
13248 return DAG.getBuildVector(VT, dl, Ops);
13249}
13250
13251static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
13252 const AArch64Subtarget *ST) {
13253 EVT VT = Op.getValueType();
13254 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
13255 "Expected a legal NEON vector");
13256
13257 APInt DefBits(VT.getSizeInBits(), 0);
13258 APInt UndefBits(VT.getSizeInBits(), 0);
13259 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
13260 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
13261 auto TryMOVIWithBits = [&](APInt DefBits) {
13262 SDValue NewOp;
13263 if ((NewOp =
13264 tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
13265 (NewOp =
13266 tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
13267 (NewOp =
13268 tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
13269 (NewOp =
13270 tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
13271 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
13272 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
13273 return NewOp;
13274
13275 APInt NotDefBits = ~DefBits;
13276 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
13277 NotDefBits)) ||
13278 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
13279 NotDefBits)) ||
13280 (NewOp =
13281 tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
13282 return NewOp;
13283 return SDValue();
13284 };
13285 if (SDValue R = TryMOVIWithBits(DefBits))
13286 return R;
13287 if (SDValue R = TryMOVIWithBits(UndefBits))
13288 return R;
13289
13290 // See if a fneg of the constant can be materialized with a MOVI, etc
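 // For example, a negative floating-point splat whose negated bit pattern is
 // a valid MOVI immediate (but which is not encodable itself) can be
 // materialized as fneg(movi) instead of a constant-pool load.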
13291 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
13292 // FNegate each sub-element of the constant
13293 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
13294 APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
13295 .zext(VT.getSizeInBits());
13296 APInt NegBits(VT.getSizeInBits(), 0);
13297 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
13298 for (unsigned i = 0; i < NumElts; i++)
13299 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
13300 NegBits = DefBits ^ NegBits;
13301
13302 // Try to create the new constants with MOVI, and if so generate a fneg
13303 // for it.
13304 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
13305 SDLoc DL(Op);
13306 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
13307 return DAG.getNode(
13308 AArch64ISD::NVCAST, DL, VT,
13309 DAG.getNode(ISD::FNEG, DL, VFVT,
13310 DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
13311 }
13312 return SDValue();
13313 };
13314 SDValue R;
13315 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
13316 (R = TryWithFNeg(DefBits, MVT::f64)) ||
13317 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
13318 return R;
13319 }
13320
13321 return SDValue();
13322}
13323
13324SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
13325 SelectionDAG &DAG) const {
13326 EVT VT = Op.getValueType();
13327
13328 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
13329 if (auto SeqInfo = cast<BuildVectorSDNode>(Op)->isConstantSequence()) {
13330 SDLoc DL(Op);
13331 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
13332 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
13333 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
13334 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
13335 return convertFromScalableVector(DAG, Op.getValueType(), Seq);
13336 }
13337
13338 // Revert to common legalisation for all other variants.
13339 return SDValue();
13340 }
13341
13342 // Try to build a simple constant vector.
13343 Op = NormalizeBuildVector(Op, DAG);
13344 // Though this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so,
13345 // abort.
13346 if (Op.getOpcode() != ISD::BUILD_VECTOR)
13347 return SDValue();
13348
13349 // Certain vector constants, used to express things like logical NOT and
13350 // arithmetic NEG, are passed through unmodified. This allows special
13351 // patterns for these operations to match, which will lower these constants
13352 // to whatever is proven necessary.
13353 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
13354 if (BVN->isConstant()) {
13355 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
13356 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
13357 APInt Val(BitSize,
13358 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
13359 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
13360 return Op;
13361 }
13362 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
13363 if (Const->isZero() && !Const->isNegative())
13364 return Op;
13365 }
13366
13367 if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
13368 return V;
13369
13370 // Scan through the operands to find some interesting properties we can
13371 // exploit:
13372 // 1) If only one value is used, we can use a DUP, or
13373 // 2) if only the low element is not undef, we can just insert that, or
13374 // 3) if only one constant value is used (w/ some non-constant lanes),
13375 // we can splat the constant value into the whole vector then fill
13376 // in the non-constant lanes.
13377 // 4) FIXME: If different constant values are used, but we can intelligently
13378 // select the values we'll be overwriting for the non-constant
13379 // lanes such that we can directly materialize the vector
13380 // some other way (MOVI, e.g.), we can be sneaky.
13381 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
13382 SDLoc dl(Op);
13383 unsigned NumElts = VT.getVectorNumElements();
13384 bool isOnlyLowElement = true;
13385 bool usesOnlyOneValue = true;
13386 bool usesOnlyOneConstantValue = true;
13387 bool isConstant = true;
13388 bool AllLanesExtractElt = true;
13389 unsigned NumConstantLanes = 0;
13390 unsigned NumDifferentLanes = 0;
13391 unsigned NumUndefLanes = 0;
13392 SDValue Value;
13393 SDValue ConstantValue;
13394 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
13395 unsigned ConsecutiveValCount = 0;
13396 SDValue PrevVal;
13397 for (unsigned i = 0; i < NumElts; ++i) {
13398 SDValue V = Op.getOperand(i);
13399 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13400 AllLanesExtractElt = false;
13401 if (V.isUndef()) {
13402 ++NumUndefLanes;
13403 continue;
13404 }
13405 if (i > 0)
13406 isOnlyLowElement = false;
13407 if (!isIntOrFPConstant(V))
13408 isConstant = false;
13409
13410 if (isIntOrFPConstant(V)) {
13411 ++NumConstantLanes;
13412 if (!ConstantValue.getNode())
13413 ConstantValue = V;
13414 else if (ConstantValue != V)
13415 usesOnlyOneConstantValue = false;
13416 }
13417
13418 if (!Value.getNode())
13419 Value = V;
13420 else if (V != Value) {
13421 usesOnlyOneValue = false;
13422 ++NumDifferentLanes;
13423 }
13424
13425 if (PrevVal != V) {
13426 ConsecutiveValCount = 0;
13427 PrevVal = V;
13428 }
13429
13430 // Keep the different values and their last consecutive counts. For example,
13431 //
13432 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
13433 // t24, t24, t24, t24, t24, t24, t24, t24
13434 // t23 = consecutive count 8
13435 // t24 = consecutive count 8
13436 // ------------------------------------------------------------------
13437 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
13438 // t24, t24, t24, t24, t24, t24, t24, t24
13439 // t23 = consecutive count 5
13440 // t24 = consecutive count 9
13441 DifferentValueMap[V] = ++ConsecutiveValCount;
13442 }
13443
13444 if (!Value.getNode()) {
13445 LLVM_DEBUG(
13446 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
13447 return DAG.getUNDEF(VT);
13448 }
13449
13450 // Convert BUILD_VECTOR where all elements but the lowest are undef into
13451 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
13452 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
13453 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
13454 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
13455 "SCALAR_TO_VECTOR node\n");
13456 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
13457 }
13458
13459 if (AllLanesExtractElt) {
13460 SDNode *Vector = nullptr;
13461 bool Even = false;
13462 bool Odd = false;
13463 // Check whether the extract elements match the Even pattern <0,2,4,...> or
13464 // the Odd pattern <1,3,5,...>.
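 // If every lane is extracted from the same source at even indices, the
 // BUILD_VECTOR is UZP1 of that source's two halves; odd indices give UZP2.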
13465 for (unsigned i = 0; i < NumElts; ++i) {
13466 SDValue V = Op.getOperand(i);
13467 const SDNode *N = V.getNode();
13468 if (!isa<ConstantSDNode>(N->getOperand(1))) {
13469 Even = false;
13470 Odd = false;
13471 break;
13472 }
13473 SDValue N0 = N->getOperand(0);
13474
13475 // All elements are extracted from the same vector.
13476 if (!Vector) {
13477 Vector = N0.getNode();
13478 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
13479 // BUILD_VECTOR.
13480 if (VT.getVectorElementType() !=
13481 N0.getValueType().getVectorElementType())
13482 break;
13483 } else if (Vector != N0.getNode()) {
13484 Odd = false;
13485 Even = false;
13486 break;
13487 }
13488
13489 // Extracted values are either at Even indices <0,2,4,...> or at Odd
13490 // indices <1,3,5,...>.
13491 uint64_t Val = N->getConstantOperandVal(1);
13492 if (Val == 2 * i) {
13493 Even = true;
13494 continue;
13495 }
13496 if (Val - 1 == 2 * i) {
13497 Odd = true;
13498 continue;
13499 }
13500
13501 // Something does not match: abort.
13502 Odd = false;
13503 Even = false;
13504 break;
13505 }
13506 if (Even || Odd) {
13507 SDValue LHS =
13508 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
13509 DAG.getConstant(0, dl, MVT::i64));
13510 SDValue RHS =
13511 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
13512 DAG.getConstant(NumElts, dl, MVT::i64));
13513
13514 if (Even && !Odd)
13515 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
13516 RHS);
13517 if (Odd && !Even)
13518 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
13519 RHS);
13520 }
13521 }
13522
13523 // Use DUP for non-constant splats. For f32 constant splats, reduce to
13524 // i32 and try again.
13525 if (usesOnlyOneValue) {
13526 if (!isConstant) {
13527 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13528 Value.getValueType() != VT) {
13529 LLVM_DEBUG(
13530 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
13531 return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
13532 }
13533
13534 // This is actually a DUPLANExx operation, which keeps everything vectory.
13535
13536 SDValue Lane = Value.getOperand(1);
13537 Value = Value.getOperand(0);
13538 if (Value.getValueSizeInBits() == 64) {
13539 LLVM_DEBUG(
13540 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
13541 "widening it\n");
13542 Value = WidenVector(Value, DAG);
13543 }
13544
13545 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
13546 return DAG.getNode(Opcode, dl, VT, Value, Lane);
13547 }
13548
13549 if (VT.getVectorElementType().isFloatingPoint()) {
13550 SmallVector<SDValue, 8> Ops;
13551 EVT EltTy = VT.getVectorElementType();
13552 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
13553 EltTy == MVT::f64) && "Unsupported floating-point vector type");
13554 LLVM_DEBUG(
13555 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
13556 "BITCASTS, and try again\n");
13557 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
13558 for (unsigned i = 0; i < NumElts; ++i)
13559 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
13560 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
13561 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
13562 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
13563 Val.dump(););
13564 Val = LowerBUILD_VECTOR(Val, DAG);
13565 if (Val.getNode())
13566 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
13567 }
13568 }
13569
13570 // If we need to insert a small number of different non-constant elements and
13571 // the vector width is sufficiently large, prefer using DUP with the common
13572 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
13573 // skip the constant lane handling below.
13574 bool PreferDUPAndInsert =
13575 !isConstant && NumDifferentLanes >= 1 &&
13576 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
13577 NumDifferentLanes >= NumConstantLanes;
13578
13579 // If there was only one constant value used and for more than one lane,
13580 // start by splatting that value, then replace the non-constant lanes. This
13581 // is better than the default, which will perform a separate initialization
13582 // for each lane.
13583 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
13584 // Firstly, try to materialize the splat constant.
13585 SDValue Val = DAG.getSplatBuildVector(VT, dl, ConstantValue);
13586 unsigned BitSize = VT.getScalarSizeInBits();
13587 APInt ConstantValueAPInt(1, 0);
13588 if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
13589 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
13590 if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
13591 !ConstantValueAPInt.isAllOnes()) {
13592 Val = ConstantBuildVector(Val, DAG, Subtarget);
13593 if (!Val)
13594 // Otherwise, materialize the constant and splat it.
13595 Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
13596 }
13597
13598 // Now insert the non-constant lanes.
13599 for (unsigned i = 0; i < NumElts; ++i) {
13600 SDValue V = Op.getOperand(i);
13601 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
13602 if (!isIntOrFPConstant(V))
13603 // Note that type legalization likely mucked about with the VT of the
13604 // source operand, so we may have to convert it here before inserting.
13605 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
13606 }
13607 return Val;
13608 }
13609
13610 // This will generate a load from the constant pool.
13611 if (isConstant) {
13612 LLVM_DEBUG(
13613 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
13614 "expansion\n");
13615 return SDValue();
13616 }
13617
13618 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
13619 // v4i32s. This is really a truncate, which we can construct out of (legal)
13620 // concats and truncate nodes.
13621 if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
13622 return M;
13623
13624 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
13625 if (NumElts >= 4) {
13626 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
13627 return Shuffle;
13628
13629 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
13630 return Shuffle;
13631 }
13632
13633 if (PreferDUPAndInsert) {
13634 // First, build a constant vector with the common element.
13635 SmallVector<SDValue, 8> Ops(NumElts, Value);
13636 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
13637 // Next, insert the elements that do not match the common value.
13638 for (unsigned I = 0; I < NumElts; ++I)
13639 if (Op.getOperand(I) != Value)
13640 NewVector =
13641 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
13642 Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
13643
13644 return NewVector;
13645 }
13646
13647 // If vector consists of two different values, try to generate two DUPs and
13648 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
13649 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
13650 SmallVector<SDValue, 2> Vals;
13651 // Check that each value's consecutive run covers half of the vector
13652 // elements. In that case, we can use CONCAT_VECTORS. For example,
13653 //
13654 // canUseVECTOR_CONCAT = true;
13655 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
13656 // t24, t24, t24, t24, t24, t24, t24, t24
13657 //
13658 // canUseVECTOR_CONCAT = false;
13659 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
13660 // t24, t24, t24, t24, t24, t24, t24, t24
13661 bool canUseVECTOR_CONCAT = true;
13662 for (auto Pair : DifferentValueMap) {
13663 // Check that both values have the same consecutive count, which is NumElts / 2.
13664 if (Pair.second != NumElts / 2)
13665 canUseVECTOR_CONCAT = false;
13666 Vals.push_back(Pair.first);
13667 }
13668
13669 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
13670 // CONCAT_VECTORs. For example,
13671 //
13672 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
13673 // t24, t24, t24, t24, t24, t24, t24, t24
13674 // ==>
13675 // t26: v8i8 = AArch64ISD::DUP t23
13676 // t28: v8i8 = AArch64ISD::DUP t24
13677 // t29: v16i8 = concat_vectors t26, t28
13678 if (canUseVECTOR_CONCAT) {
13679 EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
13680 if (isTypeLegal(SubVT) && SubVT.isVector() &&
13681 SubVT.getVectorNumElements() >= 2) {
13682 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
13683 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
13684 SDValue DUP1 =
13685 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops1), DAG);
13686 SDValue DUP2 =
13687 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops2), DAG);
13688 SDValue CONCAT_VECTORS =
13689 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DUP1, DUP2);
13690 return CONCAT_VECTORS;
13691 }
13692 }
13693
13694 // Let's try to generate VECTOR_SHUFFLE. For example,
13695 //
13696 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
13697 // ==>
13698 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
13699 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
13700 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
13701 if (NumElts >= 8) {
13702 SmallVector<int, 16> MaskVec;
13703 // Build the mask for VECTOR_SHUFFLE.
13704 SDValue FirstLaneVal = Op.getOperand(0);
13705 for (unsigned i = 0; i < NumElts; ++i) {
13706 SDValue Val = Op.getOperand(i);
13707 if (FirstLaneVal == Val)
13708 MaskVec.push_back(i);
13709 else
13710 MaskVec.push_back(i + NumElts);
13711 }
13712
13713 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
13714 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
13715 SDValue VEC1 = DAG.getBuildVector(VT, dl, Ops1);
13716 SDValue VEC2 = DAG.getBuildVector(VT, dl, Ops2);
13717 SDValue VECTOR_SHUFFLE =
13718 DAG.getVectorShuffle(VT, dl, VEC1, VEC2, MaskVec);
13719 return VECTOR_SHUFFLE;
13720 }
13721 }
13722
13723 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
13724 // know the default expansion would otherwise fall back on something even
13725 // worse. For a vector with one or two non-undef values, that's
13726 // scalar_to_vector for the elements followed by a shuffle (provided the
13727 // shuffle is valid for the target) and materialization element by element
13728 // on the stack followed by a load for everything else.
13729 if (!isConstant && !usesOnlyOneValue) {
13730 LLVM_DEBUG(
13731 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
13732 "of INSERT_VECTOR_ELT\n");
13733
13734 SDValue Vec = DAG.getUNDEF(VT);
13735 SDValue Op0 = Op.getOperand(0);
13736 unsigned i = 0;
13737
13738 // Use SCALAR_TO_VECTOR for lane zero to
13739 // a) Avoid a RMW dependency on the full vector register, and
13740 // b) Allow the register coalescer to fold away the copy if the
13741 // value is already in an S or D register, and we're forced to emit an
13742 // INSERT_SUBREG that we can't fold anywhere.
13743 //
13744 // We also allow types like i8 and i16 which are illegal scalar but legal
13745 // vector element types. After type-legalization the inserted value is
13746 // extended (i32) and it is safe to cast them to the vector type by ignoring
13747 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
13748 if (!Op0.isUndef()) {
13749 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
13750 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
13751 ++i;
13752 }
13753 LLVM_DEBUG(if (i < NumElts) dbgs()
13754 << "Creating nodes for the other vector elements:\n";);
13755 for (; i < NumElts; ++i) {
13756 SDValue V = Op.getOperand(i);
13757 if (V.isUndef())
13758 continue;
13759 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
13760 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
13761 }
13762 return Vec;
13763 }
13764
13765 LLVM_DEBUG(
13766 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
13767 "better alternative\n");
13768 return SDValue();
13769}
13770
13771SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
13772 SelectionDAG &DAG) const {
13773 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13774 !Subtarget->isNeonAvailable()))
13775 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
13776
13777 assert(Op.getValueType().isScalableVector() &&
13778 isTypeLegal(Op.getValueType()) &&
13779 "Expected legal scalable vector type!");
13780
13781 if (isTypeLegal(Op.getOperand(0).getValueType())) {
13782 unsigned NumOperands = Op->getNumOperands();
13783 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
13784 "Unexpected number of operands in CONCAT_VECTORS");
13785
13786 if (NumOperands == 2)
13787 return Op;
13788
13789 // Concat each pair of subvectors and pack into the lower half of the array.
13790 SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());
13791 while (ConcatOps.size() > 1) {
13792 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
13793 SDValue V1 = ConcatOps[I];
13794 SDValue V2 = ConcatOps[I + 1];
13795 EVT SubVT = V1.getValueType();
13796 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
13797 ConcatOps[I / 2] =
13798 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
13799 }
13800 ConcatOps.resize(ConcatOps.size() / 2);
13801 }
13802 return ConcatOps[0];
13803 }
13804
13805 return SDValue();
13806}
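// Illustrative example (not taken from the original source): with four legal
// scalable operands A, B, C, D, the pairwise loop above builds
//   round 1:  AB = concat_vectors(A, B),  CD = concat_vectors(C, D)
//   round 2:  result = concat_vectors(AB, CD)
// halving the operand count and doubling the element count each round.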
13807
13808SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
13809 SelectionDAG &DAG) const {
13810 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
13811
13812 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13813 !Subtarget->isNeonAvailable()))
13814 return LowerFixedLengthInsertVectorElt(Op, DAG);
13815
13816 EVT VT = Op.getOperand(0).getValueType();
13817
13818 if (VT.getScalarType() == MVT::i1) {
13819 EVT VectorVT = getPromotedVTForPredicate(VT);
13820 SDLoc DL(Op);
13821 SDValue ExtendedVector =
13822 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
13823 SDValue ExtendedValue =
13824 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
13825 VectorVT.getScalarType().getSizeInBits() < 32
13826 ? MVT::i32
13827 : VectorVT.getScalarType());
13828 ExtendedVector =
13829 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
13830 ExtendedValue, Op.getOperand(2));
13831 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
13832 }
13833
13834 // Check for non-constant or out of range lane.
13835 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
13836 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
13837 return SDValue();
13838
13839 return Op;
13840}
13841
13842SDValue
13843AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
13844 SelectionDAG &DAG) const {
13845 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
13846 EVT VT = Op.getOperand(0).getValueType();
13847
13848 if (VT.getScalarType() == MVT::i1) {
13849 // We can't directly extract from an SVE predicate; extend it first.
13850 // (This isn't the only possible lowering, but it's straightforward.)
13851 EVT VectorVT = getPromotedVTForPredicate(VT);
13852 SDLoc DL(Op);
13853 SDValue Extend =
13854 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
13855 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
13856 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
13857 Extend, Op.getOperand(1));
13858 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
13859 }
13860
13861 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
13862 return LowerFixedLengthExtractVectorElt(Op, DAG);
13863
13864 // Check for non-constant or out of range lane.
13865 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
13866 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
13867 return SDValue();
13868
13869 // Insertion/extraction are legal for V128 types.
13870 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
13871 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
13872 VT == MVT::v8f16 || VT == MVT::v8bf16)
13873 return Op;
13874
13875 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
13876 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
13877 VT != MVT::v4bf16)
13878 return SDValue();
13879
13880 // For V64 types, we perform extraction by expanding the value
13881 // to a V128 type and performing the extraction on that.
13882 SDLoc DL(Op);
13883 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
13884 EVT WideTy = WideVec.getValueType();
13885
13886 EVT ExtrTy = WideTy.getVectorElementType();
13887 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
13888 ExtrTy = MVT::i32;
13889
13890 // For extractions, we just return the result directly.
13891 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
13892 Op.getOperand(1));
13893}
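// Illustrative example (not taken from the original source): extracting lane 3
// of a v4i16 vector is handled by widening the operand to v8i16 (the data lives
// in the low half of the wider register) and extracting lane 3 of that,
// returning the element as an i32 since i16 is not a legal scalar type.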
13894
13895SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
13896 SelectionDAG &DAG) const {
13897 assert(Op.getValueType().isFixedLengthVector() &&
13898 "Only cases that extract a fixed length vector are supported!");
13899
13900 EVT InVT = Op.getOperand(0).getValueType();
13901 unsigned Idx = Op.getConstantOperandVal(1);
13902 unsigned Size = Op.getValueSizeInBits();
13903
13904 // If we don't have legal types yet, do nothing
13905 if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT))
13906 return SDValue();
13907
13908 if (InVT.isScalableVector()) {
13909 // This will be matched by custom code during ISelDAGToDAG.
13910 if (Idx == 0 && isPackedVectorType(InVT, DAG))
13911 return Op;
13912
13913 return SDValue();
13914 }
13915
13916 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
13917 if (Idx == 0 && InVT.getSizeInBits() <= 128)
13918 return Op;
13919
13920 // If this is extracting the upper 64-bits of a 128-bit vector, we match
13921 // that directly.
13922 if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 &&
13923 InVT.getSizeInBits() == 128 && Subtarget->isNeonAvailable())
13924 return Op;
13925
13926 if (useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
13927 SDLoc DL(Op);
13928
13929 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
13930 SDValue NewInVec =
13931 convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
13932
13933 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ContainerVT, NewInVec,
13934 NewInVec, DAG.getConstant(Idx, DL, MVT::i64));
13935 return convertFromScalableVector(DAG, Op.getValueType(), Splice);
13936 }
13937
13938 return SDValue();
13939}
13940
13941SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
13942 SelectionDAG &DAG) const {
13943 assert(Op.getValueType().isScalableVector() &&
13944 "Only expect to lower inserts into scalable vectors!");
13945
13946 EVT InVT = Op.getOperand(1).getValueType();
13947 unsigned Idx = Op.getConstantOperandVal(2);
13948
13949 SDValue Vec0 = Op.getOperand(0);
13950 SDValue Vec1 = Op.getOperand(1);
13951 SDLoc DL(Op);
13952 EVT VT = Op.getValueType();
13953
13954 if (InVT.isScalableVector()) {
13955 if (!isTypeLegal(VT))
13956 return SDValue();
13957
13958 // Break down insert_subvector into simpler parts.
13959 if (VT.getVectorElementType() == MVT::i1) {
13960 unsigned NumElts = VT.getVectorMinNumElements();
13961 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
13962
13963 SDValue Lo, Hi;
13964 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
13965 DAG.getVectorIdxConstant(0, DL));
13966 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
13967 DAG.getVectorIdxConstant(NumElts / 2, DL));
13968 if (Idx < (NumElts / 2)) {
13969 SDValue NewLo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
13970 DAG.getVectorIdxConstant(Idx, DL));
13971 return DAG.getNode(AArch64ISD::UZP1, DL, VT, NewLo, Hi);
13972 } else {
13973 SDValue NewHi =
13974 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
13975 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
13976 return DAG.getNode(AArch64ISD::UZP1, DL, VT, Lo, NewHi);
13977 }
13978 }
13979
13980 // Ensure the subvector is half the size of the main vector.
13981 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
13982 return SDValue();
13983
13984 // Here narrow and wide refer to the vector element types. After "casting",
13985 // both vectors must have the same bit length, so because the subvector
13986 // has fewer elements, those elements need to be bigger.
13987 EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
13988 EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
13989
13990 // NOP cast operands to the largest legal vector of the same element count.
13991 if (VT.isFloatingPoint()) {
13992 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
13993 Vec1 = getSVESafeBitCast(WideVT, Vec1, DAG);
13994 } else {
13995 // Legal integer vectors are already their largest so Vec0 is fine as is.
13996 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
13997 }
13998
13999 // To replace the top/bottom half of vector V with vector SubV we widen the
14000 // preserved half of V, concatenate this to SubV (the order depending on the
14001 // half being replaced) and then narrow the result.
14002 SDValue Narrow;
14003 if (Idx == 0) {
14004 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
14005 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
14006 } else {
14007 assert(Idx == InVT.getVectorMinNumElements() &&
14008 "Invalid subvector index!");
14009 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
14010 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
14011 }
14012
14013 return getSVESafeBitCast(VT, Narrow, DAG);
14014 }
14015
14016 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
14017 // This will be matched by custom code during ISelDAGToDAG.
14018 if (Vec0.isUndef())
14019 return Op;
14020
14021 std::optional<unsigned> PredPattern =
14022 getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
14023 auto PredTy = VT.changeVectorElementType(MVT::i1);
14024 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
14025 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
14026 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
14027 }
14028
14029 return SDValue();
14030}
14031
14032static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
14033 if (Op.getOpcode() != AArch64ISD::DUP &&
14034 Op.getOpcode() != ISD::SPLAT_VECTOR &&
14035 Op.getOpcode() != ISD::BUILD_VECTOR)
14036 return false;
14037
14038 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
14039 !isAllConstantBuildVector(Op, SplatVal))
14040 return false;
14041
14042 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
14043 !isa<ConstantSDNode>(Op->getOperand(0)))
14044 return false;
14045
14046 SplatVal = Op->getConstantOperandVal(0);
14047 if (Op.getValueType().getVectorElementType() != MVT::i64)
14048 SplatVal = (int32_t)SplatVal;
14049
14050 Negated = false;
14051 if (isPowerOf2_64(SplatVal))
14052 return true;
14053
14054 Negated = true;
14055 if (isPowerOf2_64(-SplatVal)) {
14056 SplatVal = -SplatVal;
14057 return true;
14058 }
14059
14060 return false;
14061}
14062
14063SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
14064 EVT VT = Op.getValueType();
14065 SDLoc dl(Op);
14066
14067 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
14068 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
14069
14070 assert(VT.isScalableVector() && "Expected a scalable vector.");
14071
14072 bool Signed = Op.getOpcode() == ISD::SDIV;
14073 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
14074
14075 bool Negated;
14076 uint64_t SplatVal;
14077 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
14078 SDValue Pg = getPredicateForScalableVector(DAG, dl, VT);
14079 SDValue Res =
14080 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0),
14081 DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32));
14082 if (Negated)
14083 Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res);
14084
14085 return Res;
14086 }
14087
14088 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
14089 return LowerToPredicatedOp(Op, DAG, PredOpcode);
14090
14091 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
14092 // operations, and truncate the result.
14093 EVT WidenedVT;
14094 if (VT == MVT::nxv16i8)
14095 WidenedVT = MVT::nxv8i16;
14096 else if (VT == MVT::nxv8i16)
14097 WidenedVT = MVT::nxv4i32;
14098 else
14099 llvm_unreachable("Unexpected Custom DIV operation");
14100
14101 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
14102 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
14103 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
14104 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
14105 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
14106 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
14107 SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
14108 SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
14109 return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
14110}
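// A minimal scalar sketch of the signed power-of-two path above (illustration
// only; the helper name below is made up and is not used by the lowering).
// SRAD rounds the arithmetic shift towards zero, and a negative power-of-two
// divisor simply negates the result afterwards.
static inline int64_t sdivPow2Sketch(int64_t X, unsigned Log2Divisor,
                                     bool Negated) {
  // Bias negative values so the arithmetic shift rounds towards zero.
  int64_t Bias = (X < 0) ? ((int64_t(1) << Log2Divisor) - 1) : 0;
  int64_t Quot = (X + Bias) >> Log2Divisor;
  return Negated ? -Quot : Quot;
}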
14111
14112 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
14113 // Currently no fixed length shuffles that require SVE are legal.
14114 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14115 return false;
14116
14117 if (VT.getVectorNumElements() == 4 &&
14118 (VT.is128BitVector() || VT.is64BitVector())) {
14119 unsigned Cost = getPerfectShuffleCost(M);
14120 if (Cost <= 1)
14121 return true;
14122 }
14123
14124 bool DummyBool;
14125 int DummyInt;
14126 unsigned DummyUnsigned;
14127
14128 return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
14129 isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
14130 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
14131 // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
14132 isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
14133 isZIPMask(M, VT, DummyUnsigned) ||
14134 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
14135 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
14136 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
14137 isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
14138 isConcatMask(M, VT, VT.getSizeInBits() == 128));
14139}
14140
14141 bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
14142 EVT VT) const {
14143 // Just delegate to the generic legality, clear masks aren't special.
14144 return isShuffleMaskLegal(M, VT);
14145}
14146
14147/// getVShiftImm - Check if this is a valid build_vector for the immediate
14148/// operand of a vector shift operation, where all the elements of the
14149/// build_vector must have the same constant integer value.
14150static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
14151 // Ignore bit_converts.
14152 while (Op.getOpcode() == ISD::BITCAST)
14153 Op = Op.getOperand(0);
14154 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
14155 APInt SplatBits, SplatUndef;
14156 unsigned SplatBitSize;
14157 bool HasAnyUndefs;
14158 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
14159 HasAnyUndefs, ElementBits) ||
14160 SplatBitSize > ElementBits)
14161 return false;
14162 Cnt = SplatBits.getSExtValue();
14163 return true;
14164}
14165
14166/// isVShiftLImm - Check if this is a valid build_vector for the immediate
14167/// operand of a vector shift left operation. That value must be in the range:
14168/// 0 <= Value < ElementBits for a left shift; or
14169/// 0 <= Value <= ElementBits for a long left shift.
14170static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
14171 assert(VT.isVector() && "vector shift count is not a vector type");
14172 int64_t ElementBits = VT.getScalarSizeInBits();
14173 if (!getVShiftImm(Op, ElementBits, Cnt))
14174 return false;
14175 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
14176}
14177
14178/// isVShiftRImm - Check if this is a valid build_vector for the immediate
14179/// operand of a vector shift right operation. The value must be in the range:
14180 /// 1 <= Value <= ElementBits for a right shift; or 1 <= Value <= ElementBits/2 for a narrow right shift.
14181static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
14182 assert(VT.isVector() && "vector shift count is not a vector type");
14183 int64_t ElementBits = VT.getScalarSizeInBits();
14184 if (!getVShiftImm(Op, ElementBits, Cnt))
14185 return false;
14186 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
14187}
14188
14189SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
14190 SelectionDAG &DAG) const {
14191 EVT VT = Op.getValueType();
14192
14193 if (VT.getScalarType() == MVT::i1) {
14194 // Lower i1 truncate to `(x & 1) != 0`.
14195 SDLoc dl(Op);
14196 EVT OpVT = Op.getOperand(0).getValueType();
14197 SDValue Zero = DAG.getConstant(0, dl, OpVT);
14198 SDValue One = DAG.getConstant(1, dl, OpVT);
14199 SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
14200 return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
14201 }
14202
14203 if (!VT.isVector() || VT.isScalableVector())
14204 return SDValue();
14205
14206 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
14207 !Subtarget->isNeonAvailable()))
14208 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
14209
14210 return SDValue();
14211}
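// Scalar sketch of the i1 truncation above (illustration only; the helper name
// is made up): truncating to a single bit keeps only the low bit, i.e.
// (x & 1) != 0.
static inline bool truncToI1Sketch(uint64_t X) { return (X & 1) != 0; }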
14212
14213 // Check if we can lower this SRL to a rounding shift instruction. ResVT is
14214 // possibly a truncated type; it tells how many bits of the value are to be
14215// used.
14216 static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT,
14217 SelectionDAG &DAG,
14218 unsigned &ShiftValue,
14219 SDValue &RShOperand) {
14220 if (Shift->getOpcode() != ISD::SRL)
14221 return false;
14222
14223 EVT VT = Shift.getValueType();
14224 assert(VT.isScalableVT());
14225
14226 auto ShiftOp1 =
14227 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Shift->getOperand(1)));
14228 if (!ShiftOp1)
14229 return false;
14230
14231 ShiftValue = ShiftOp1->getZExtValue();
14232 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
14233 return false;
14234
14235 SDValue Add = Shift->getOperand(0);
14236 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
14237 return false;
14238
14239 assert(ResVT.getScalarSizeInBits() <= VT.getScalarSizeInBits() &&
14240 "ResVT must be truncated or same type as the shift.");
14241 // Check if an overflow can lead to incorrect results.
14242 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
14243 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
14244 return false;
14245
14246 auto AddOp1 =
14247 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
14248 if (!AddOp1)
14249 return false;
14250 uint64_t AddValue = AddOp1->getZExtValue();
14251 if (AddValue != 1ULL << (ShiftValue - 1))
14252 return false;
14253
14254 RShOperand = Add->getOperand(0);
14255 return true;
14256}
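// Scalar sketch of the pattern matched above (illustration only; the helper
// name is made up): adding 1 << (Shift - 1) before a logical right shift by
// Shift is an unsigned rounding right shift, which is what URSHR computes per
// lane. Like the DAG pattern, this assumes the addition does not wrap.
static inline uint64_t roundingRightShiftSketch(uint64_t X, unsigned Shift) {
  return (X + (uint64_t(1) << (Shift - 1))) >> Shift;
}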
14257
14258SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
14259 SelectionDAG &DAG) const {
14260 EVT VT = Op.getValueType();
14261 SDLoc DL(Op);
14262 int64_t Cnt;
14263
14264 if (!Op.getOperand(1).getValueType().isVector())
14265 return Op;
14266 unsigned EltSize = VT.getScalarSizeInBits();
14267
14268 switch (Op.getOpcode()) {
14269 case ISD::SHL:
14270 if (VT.isScalableVector() ||
14271 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14272 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
14273
14274 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
14275 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
14276 DAG.getConstant(Cnt, DL, MVT::i32));
14277 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
14278 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
14279 MVT::i32),
14280 Op.getOperand(0), Op.getOperand(1));
14281 case ISD::SRA:
14282 case ISD::SRL:
14283 if (VT.isScalableVector() && Subtarget->hasSVE2orSME()) {
14284 SDValue RShOperand;
14285 unsigned ShiftValue;
14286 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
14287 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
14288 getPredicateForVector(DAG, DL, VT), RShOperand,
14289 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
14290 }
14291
14292 if (VT.isScalableVector() ||
14293 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
14294 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
14295 : AArch64ISD::SRL_PRED;
14296 return LowerToPredicatedOp(Op, DAG, Opc);
14297 }
14298
14299 // Right shift immediate
14300 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
14301 unsigned Opc =
14302 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
14303 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
14304 DAG.getConstant(Cnt, DL, MVT::i32));
14305 }
14306
14307 // Right shift by register. Note that there is no shift-right-by-register
14308 // instruction, but the shift-left-by-register instruction takes a signed
14309 // value, where negative amounts specify a right shift.
14310 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
14311 : Intrinsic::aarch64_neon_ushl;
14312 // negate the shift amount
14313 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
14314 Op.getOperand(1));
14315 SDValue NegShiftLeft =
14316 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
14317 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
14318 NegShift);
14319 return NegShiftLeft;
14320 }
14321
14322 llvm_unreachable("unexpected shift opcode");
14323}
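// Scalar sketch of the "right shift by register" trick above (illustration
// only; the helper name is made up): USHL interprets a negative per-lane shift
// amount as a right shift, so x >> n is emitted as a shift left by -n.
static inline uint64_t ushlLaneSketch(uint64_t X, int8_t Shift) {
  return Shift >= 0 ? X << Shift : X >> -Shift;
}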
14324
14325 static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
14326 AArch64CC::CondCode CC, bool NoNans, EVT VT,
14327 const SDLoc &dl, SelectionDAG &DAG) {
14328 EVT SrcVT = LHS.getValueType();
14329 assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
14330 "function only supposed to emit natural comparisons");
14331
14332 APInt SplatValue;
14333 APInt SplatUndef;
14334 unsigned SplatBitSize = 0;
14335 bool HasAnyUndefs;
14336
14337 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
14338 bool IsCnst = BVN && BVN->isConstantSplat(SplatValue, SplatUndef,
14339 SplatBitSize, HasAnyUndefs);
14340
14341 bool IsZero = IsCnst && SplatValue == 0;
14342 bool IsOne =
14343 IsCnst && SrcVT.getScalarSizeInBits() == SplatBitSize && SplatValue == 1;
14344 bool IsMinusOne = IsCnst && SplatValue.isAllOnes();
14345
14346 if (SrcVT.getVectorElementType().isFloatingPoint()) {
14347 switch (CC) {
14348 default:
14349 return SDValue();
14350 case AArch64CC::NE: {
14351 SDValue Fcmeq;
14352 if (IsZero)
14353 Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
14354 else
14355 Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
14356 return DAG.getNOT(dl, Fcmeq, VT);
14357 }
14358 case AArch64CC::EQ:
14359 if (IsZero)
14360 return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
14361 return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
14362 case AArch64CC::GE:
14363 if (IsZero)
14364 return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
14365 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
14366 case AArch64CC::GT:
14367 if (IsZero)
14368 return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
14369 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
14370 case AArch64CC::LE:
14371 if (!NoNans)
14372 return SDValue();
14373 // If we ignore NaNs then we can use the LS implementation.
14374 [[fallthrough]];
14375 case AArch64CC::LS:
14376 if (IsZero)
14377 return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
14378 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
14379 case AArch64CC::LT:
14380 if (!NoNans)
14381 return SDValue();
14382 // If we ignore NaNs then we can use the MI implementation.
14383 [[fallthrough]];
14384 case AArch64CC::MI:
14385 if (IsZero)
14386 return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
14387 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
14388 }
14389 }
14390
14391 switch (CC) {
14392 default:
14393 return SDValue();
14394 case AArch64CC::NE: {
14395 SDValue Cmeq;
14396 if (IsZero)
14397 Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
14398 else
14399 Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
14400 return DAG.getNOT(dl, Cmeq, VT);
14401 }
14402 case AArch64CC::EQ:
14403 if (IsZero)
14404 return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
14405 return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
14406 case AArch64CC::GE:
14407 if (IsZero)
14408 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
14409 return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
14410 case AArch64CC::GT:
14411 if (IsZero)
14412 return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
14413 if (IsMinusOne)
14414 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS, RHS);
14415 return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
14416 case AArch64CC::LE:
14417 if (IsZero)
14418 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
14419 return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
14420 case AArch64CC::LS:
14421 return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
14422 case AArch64CC::LO:
14423 return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
14424 case AArch64CC::LT:
14425 if (IsZero)
14426 return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
14427 if (IsOne)
14428 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
14429 return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
14430 case AArch64CC::HI:
14431 return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
14432 case AArch64CC::HS:
14433 return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
14434 }
14435}
14436
14437SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
14438 SelectionDAG &DAG) const {
14439 if (Op.getValueType().isScalableVector())
14440 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
14441
14442 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
14443 !Subtarget->isNeonAvailable()))
14444 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
14445
14446 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
14447 SDValue LHS = Op.getOperand(0);
14448 SDValue RHS = Op.getOperand(1);
14449 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
14450 SDLoc dl(Op);
14451
14452 if (LHS.getValueType().getVectorElementType().isInteger()) {
14453 assert(LHS.getValueType() == RHS.getValueType());
14454 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
14455 SDValue Cmp =
14456 EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
14457 return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
14458 }
14459
14460 // Lower isnan(x) | isnan(never-nan) to x != x.
14461 // Lower !isnan(x) & !isnan(never-nan) to x == x.
14462 if (CC == ISD::SETUO || CC == ISD::SETO) {
14463 bool OneNaN = false;
14464 if (LHS == RHS) {
14465 OneNaN = true;
14466 } else if (DAG.isKnownNeverNaN(RHS)) {
14467 OneNaN = true;
14468 RHS = LHS;
14469 } else if (DAG.isKnownNeverNaN(LHS)) {
14470 OneNaN = true;
14471 LHS = RHS;
14472 }
14473 if (OneNaN) {
14474 CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
14475 }
14476 }
14477
14478 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
14479
14480 // Make v4f16 (only) fcmp operations utilise vector instructions
14481 // v8f16 support will be a little more complicated
14482 if ((!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) ||
14483 LHS.getValueType().getVectorElementType() == MVT::bf16) {
14484 if (LHS.getValueType().getVectorNumElements() == 4) {
14485 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
14486 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
14487 SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
14488 DAG.ReplaceAllUsesWith(Op, NewSetcc);
14489 CmpVT = MVT::v4i32;
14490 } else
14491 return SDValue();
14492 }
14493
14494 assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
14495 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
14496 LHS.getValueType().getVectorElementType() != MVT::f128);
14497
14498 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
14499 // clean. Some of them require two branches to implement.
14500 AArch64CC::CondCode CC1, CC2;
14501 bool ShouldInvert;
14502 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
14503
14504 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
14505 SDValue Cmp =
14506 EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
14507 if (!Cmp.getNode())
14508 return SDValue();
14509
14510 if (CC2 != AArch64CC::AL) {
14511 SDValue Cmp2 =
14512 EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
14513 if (!Cmp2.getNode())
14514 return SDValue();
14515
14516 Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
14517 }
14518
14519 Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
14520
14521 if (ShouldInvert)
14522 Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
14523
14524 return Cmp;
14525}
14526
14527static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
14528 SelectionDAG &DAG) {
14529 SDValue VecOp = ScalarOp.getOperand(0);
14530 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
14531 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
14532 DAG.getConstant(0, DL, MVT::i64));
14533}
14534
14535static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
14536 SDLoc DL, SelectionDAG &DAG) {
14537 unsigned ScalarOpcode;
14538 switch (Opcode) {
14539 case ISD::VECREDUCE_AND:
14540 ScalarOpcode = ISD::AND;
14541 break;
14542 case ISD::VECREDUCE_OR:
14543 ScalarOpcode = ISD::OR;
14544 break;
14545 case ISD::VECREDUCE_XOR:
14546 ScalarOpcode = ISD::XOR;
14547 break;
14548 default:
14549 llvm_unreachable("Expected bitwise vector reduction");
14550 return SDValue();
14551 }
14552
14553 EVT VecVT = Vec.getValueType();
14554 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
14555 "Expected power-of-2 length vector");
14556
14557 EVT ElemVT = VecVT.getVectorElementType();
14558
14559 SDValue Result;
14560 unsigned NumElems = VecVT.getVectorNumElements();
14561
14562 // Special case for boolean reductions
14563 if (ElemVT == MVT::i1) {
14564 // Split large vectors into smaller ones
14565 if (NumElems > 16) {
14566 SDValue Lo, Hi;
14567 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
14568 EVT HalfVT = Lo.getValueType();
14569 SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
14570 return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
14571 }
14572
14573 // Vectors that are less than 64 bits get widened to neatly fit a 64 bit
14574 // register, so e.g. <4 x i1> gets lowered to <4 x i16>. Sign extending to
14575 // this element size leads to the best codegen, since e.g. setcc results
14576 // might need to be truncated otherwise.
14577 EVT ExtendedVT = MVT::getIntegerVT(std::max(64u / NumElems, 8u));
14578
14579 // any_ext doesn't work with umin/umax, so only use it for uadd.
14580 unsigned ExtendOp =
14581 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
14582 SDValue Extended = DAG.getNode(
14583 ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
14584 switch (ScalarOpcode) {
14585 case ISD::AND:
14586 Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
14587 break;
14588 case ISD::OR:
14589 Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
14590 break;
14591 case ISD::XOR:
14592 Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
14593 break;
14594 default:
14595 llvm_unreachable("Unexpected Opcode");
14596 }
14597
14598 Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
14599 } else {
14600 // Iteratively split the vector in half and combine using the bitwise
14601 // operation until it fits in a 64 bit register.
14602 while (VecVT.getSizeInBits() > 64) {
14603 SDValue Lo, Hi;
14604 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
14605 VecVT = Lo.getValueType();
14606 NumElems = VecVT.getVectorNumElements();
14607 Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
14608 }
14609
14610 EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
14611
14612 // Do the remaining work on a scalar since it allows the code generator to
14613 // combine the shift and bitwise operation into one instruction and since
14614 // integer instructions can have higher throughput than vector instructions.
14615 SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
14616
14617 // Iteratively combine the lower and upper halves of the scalar using the
14618 // bitwise operation, halving the relevant region of the scalar in each
14619 // iteration, until the relevant region is just one element of the original
14620 // vector.
14621 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
14622 SDValue ShiftAmount =
14623 DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
14624 SDValue Shifted =
14625 DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
14626 Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
14627 }
14628
14629 Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
14630 }
14631
14632 return DAG.getAnyExtOrTrunc(Result, DL, VT);
14633}
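// Scalar sketch of the shift-and-combine loop above (illustration only; the
// helper name is made up): a v8i8 XOR reduction bitcast to an i64 folds the
// upper half onto the lower half three times (shifts of 32, 16 and 8 bits),
// leaving the result in the lowest byte.
static inline uint8_t xorReduce8x8Sketch(uint64_t Vec) {
  for (unsigned ShiftBits = 32; ShiftBits >= 8; ShiftBits /= 2)
    Vec ^= Vec >> ShiftBits;
  return static_cast<uint8_t>(Vec);
}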
14634
14635SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
14636 SelectionDAG &DAG) const {
14637 SDValue Src = Op.getOperand(0);
14638
14639 // Try to lower fixed length reductions to SVE.
14640 EVT SrcVT = Src.getValueType();
14641 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
14642 Op.getOpcode() == ISD::VECREDUCE_AND ||
14643 Op.getOpcode() == ISD::VECREDUCE_OR ||
14644 Op.getOpcode() == ISD::VECREDUCE_XOR ||
14645 Op.getOpcode() == ISD::VECREDUCE_FADD ||
14646 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
14647 SrcVT.getVectorElementType() == MVT::i64);
14648 if (SrcVT.isScalableVector() ||
14649 useSVEForFixedLengthVectorVT(
14650 SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
14651
14652 if (SrcVT.getVectorElementType() == MVT::i1)
14653 return LowerPredReductionToSVE(Op, DAG);
14654
14655 switch (Op.getOpcode()) {
14656 case ISD::VECREDUCE_ADD:
14657 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
14658 case ISD::VECREDUCE_AND:
14659 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
14660 case ISD::VECREDUCE_OR:
14661 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
14662 case ISD::VECREDUCE_SMAX:
14663 return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
14664 case ISD::VECREDUCE_SMIN:
14665 return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
14666 case ISD::VECREDUCE_UMAX:
14667 return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
14668 case ISD::VECREDUCE_UMIN:
14669 return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
14670 case ISD::VECREDUCE_XOR:
14671 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
14672 case ISD::VECREDUCE_FADD:
14673 return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
14674 case ISD::VECREDUCE_FMAX:
14675 return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
14676 case ISD::VECREDUCE_FMIN:
14677 return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
14678 case ISD::VECREDUCE_FMAXIMUM:
14679 return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
14680 case ISD::VECREDUCE_FMINIMUM:
14681 return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
14682 default:
14683 llvm_unreachable("Unhandled fixed length reduction");
14684 }
14685 }
14686
14687 // Lower NEON reductions.
14688 SDLoc dl(Op);
14689 switch (Op.getOpcode()) {
14690 case ISD::VECREDUCE_AND:
14691 case ISD::VECREDUCE_OR:
14692 case ISD::VECREDUCE_XOR:
14693 return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
14694 Op.getValueType(), dl, DAG);
14695 case ISD::VECREDUCE_ADD:
14696 return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
14697 case ISD::VECREDUCE_SMAX:
14698 return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
14699 case ISD::VECREDUCE_SMIN:
14700 return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
14701 case ISD::VECREDUCE_UMAX:
14702 return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
14703 case ISD::VECREDUCE_UMIN:
14704 return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
14705 default:
14706 llvm_unreachable("Unhandled reduction");
14707 }
14708}
14709
14710SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
14711 SelectionDAG &DAG) const {
14712 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
14713 // No point replacing if we don't have the relevant instruction/libcall anyway
14714 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
14715 return SDValue();
14716
14717 // LSE has an atomic load-clear instruction, but not a load-and.
14718 SDLoc dl(Op);
14719 MVT VT = Op.getSimpleValueType();
14720 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
14721 SDValue RHS = Op.getOperand(2);
14722 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
14723 RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
14724 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
14725 Op.getOperand(0), Op.getOperand(1), RHS,
14726 AN->getMemOperand());
14727}
14728
14729SDValue
14730AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
14731 SelectionDAG &DAG) const {
14732
14733 SDLoc dl(Op);
14734 // Get the inputs.
14735 SDNode *Node = Op.getNode();
14736 SDValue Chain = Op.getOperand(0);
14737 SDValue Size = Op.getOperand(1);
14738 MaybeAlign Align =
14739 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
14740 EVT VT = Node->getValueType(0);
14741
14742 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
14743 "no-stack-arg-probe")) {
14744 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14745 Chain = SP.getValue(1);
14746 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14747 if (Align)
14748 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14749 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
14750 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
14751 SDValue Ops[2] = {SP, Chain};
14752 return DAG.getMergeValues(Ops, dl);
14753 }
14754
14755 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
14756
14757 EVT PtrVT = getPointerTy(DAG.getDataLayout());
14758 SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
14759 PtrVT, 0);
14760
14761 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
14762 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
14763 if (Subtarget->hasCustomCallingConv())
14764 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
14765
14766 Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
14767 DAG.getConstant(4, dl, MVT::i64));
14768 Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
14769 Chain =
14770 DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
14771 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
14772 DAG.getRegisterMask(Mask), Chain.getValue(1));
14773 // To match the actual intent better, we should read the output from X15 here
14774 // again (instead of potentially spilling it to the stack), but rereading Size
14775 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
14776 // here.
14777
14778 Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
14779 DAG.getConstant(4, dl, MVT::i64));
14780
14781 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14782 Chain = SP.getValue(1);
14783 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14784 if (Align)
14785 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14786 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
14787 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
14788
14789 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
14790
14791 SDValue Ops[2] = {SP, Chain};
14792 return DAG.getMergeValues(Ops, dl);
14793}
14794
14795SDValue
14796AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
14797 SelectionDAG &DAG) const {
14798 // Get the inputs.
14799 SDNode *Node = Op.getNode();
14800 SDValue Chain = Op.getOperand(0);
14801 SDValue Size = Op.getOperand(1);
14802
14803 MaybeAlign Align =
14804 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
14805 SDLoc dl(Op);
14806 EVT VT = Node->getValueType(0);
14807
14808 // Construct the new SP value in a GPR.
14809 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14810 Chain = SP.getValue(1);
14811 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14812 if (Align)
14813 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14814 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
14815
14816 // Set the real SP to the new value with a probing loop.
14817 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
14818 SDValue Ops[2] = {SP, Chain};
14819 return DAG.getMergeValues(Ops, dl);
14820}
14821
14822SDValue
14823AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
14824 SelectionDAG &DAG) const {
14825 MachineFunction &MF = DAG.getMachineFunction();
14826
14827 if (Subtarget->isTargetWindows())
14828 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
14829 else if (hasInlineStackProbe(MF))
14830 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
14831 else
14832 return SDValue();
14833}
14834
14835// When x and y are extended, lower:
14836// avgfloor(x, y) -> (x + y) >> 1
14837// avgceil(x, y) -> (x + y + 1) >> 1
14838
14839// Otherwise, lower to:
14840// avgfloor(x, y) -> (x >> 1) + (y >> 1) + (x & y & 1)
14841// avgceil(x, y) -> (x >> 1) + (y >> 1) + ((x || y) & 1)
14842SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
14843 unsigned NewOp) const {
14844 if (Subtarget->hasSVE2())
14845 return LowerToPredicatedOp(Op, DAG, NewOp);
14846
14847 SDLoc dl(Op);
14848 SDValue OpA = Op->getOperand(0);
14849 SDValue OpB = Op->getOperand(1);
14850 EVT VT = Op.getValueType();
14851 bool IsCeil =
14852 (Op->getOpcode() == ISD::AVGCEILS || Op->getOpcode() == ISD::AVGCEILU);
14853 bool IsSigned =
14854 (Op->getOpcode() == ISD::AVGFLOORS || Op->getOpcode() == ISD::AVGCEILS);
14855 unsigned ShiftOpc = IsSigned ? ISD::SRA : ISD::SRL;
14856
14857 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
14858
14859 auto IsZeroExtended = [&DAG](SDValue &Node) {
14860 KnownBits Known = DAG.computeKnownBits(Node, 0);
14861 return Known.Zero.isSignBitSet();
14862 };
14863
14864 auto IsSignExtended = [&DAG](SDValue &Node) {
14865 return (DAG.ComputeNumSignBits(Node, 0) > 1);
14866 };
14867
14868 SDValue ConstantOne = DAG.getConstant(1, dl, VT);
14869 if ((!IsSigned && IsZeroExtended(OpA) && IsZeroExtended(OpB)) ||
14870 (IsSigned && IsSignExtended(OpA) && IsSignExtended(OpB))) {
14871 SDValue Add = DAG.getNode(ISD::ADD, dl, VT, OpA, OpB);
14872 if (IsCeil)
14873 Add = DAG.getNode(ISD::ADD, dl, VT, Add, ConstantOne);
14874 return DAG.getNode(ShiftOpc, dl, VT, Add, ConstantOne);
14875 }
14876
14877 SDValue ShiftOpA = DAG.getNode(ShiftOpc, dl, VT, OpA, ConstantOne);
14878 SDValue ShiftOpB = DAG.getNode(ShiftOpc, dl, VT, OpB, ConstantOne);
14879
14880 SDValue tmp = DAG.getNode(IsCeil ? ISD::OR : ISD::AND, dl, VT, OpA, OpB);
14881 tmp = DAG.getNode(ISD::AND, dl, VT, tmp, ConstantOne);
14882 SDValue Add = DAG.getNode(ISD::ADD, dl, VT, ShiftOpA, ShiftOpB);
14883 return DAG.getNode(ISD::ADD, dl, VT, Add, tmp);
14884}
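// Scalar sketches of the overflow-safe expansions documented above for the
// unsigned case (illustration only; the helper names are made up and are not
// part of the lowering).
static inline uint32_t avgFloorUSketch(uint32_t X, uint32_t Y) {
  return (X >> 1) + (Y >> 1) + (X & Y & 1);
}
static inline uint32_t avgCeilUSketch(uint32_t X, uint32_t Y) {
  return (X >> 1) + (Y >> 1) + ((X | Y) & 1);
}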
14885
14886SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
14887 SelectionDAG &DAG) const {
14888 EVT VT = Op.getValueType();
14889 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
14890
14891 SDLoc DL(Op);
14892 APInt MulImm = Op.getConstantOperandAPInt(0);
14893 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
14894 VT);
14895}
14896
14897/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
14898template <unsigned NumVecs>
14899static bool
14900 setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
14901 AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
14902 Info.opc = ISD::INTRINSIC_VOID;
14903 // Retrieve EC from first vector argument.
14904 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
14905 ElementCount EC = VT.getVectorElementCount();
14906 #ifndef NDEBUG
14907 // Check the assumption that all input vectors are the same type.
14908 for (unsigned I = 0; I < NumVecs; ++I)
14909 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
14910 "Invalid type.");
14911#endif
14912 // memVT is `NumVecs * VT`.
14913 Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
14914 EC * NumVecs);
14915 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
14916 Info.offset = 0;
14917 Info.align.reset();
14918 Info.flags = MachineMemOperand::MOStore;
14919 return true;
14920}
14921
14922/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
14923/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
14924/// specified in the intrinsic calls.
14925 bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
14926 const CallInst &I,
14927 MachineFunction &MF,
14928 unsigned Intrinsic) const {
14929 auto &DL = I.getModule()->getDataLayout();
14930 switch (Intrinsic) {
14931 case Intrinsic::aarch64_sve_st2:
14932 return setInfoSVEStN<2>(*this, DL, Info, I);
14933 case Intrinsic::aarch64_sve_st3:
14934 return setInfoSVEStN<3>(*this, DL, Info, I);
14935 case Intrinsic::aarch64_sve_st4:
14936 return setInfoSVEStN<4>(*this, DL, Info, I);
14937 case Intrinsic::aarch64_neon_ld2:
14938 case Intrinsic::aarch64_neon_ld3:
14939 case Intrinsic::aarch64_neon_ld4:
14940 case Intrinsic::aarch64_neon_ld1x2:
14941 case Intrinsic::aarch64_neon_ld1x3:
14942 case Intrinsic::aarch64_neon_ld1x4: {
14943 Info.opc = ISD::INTRINSIC_W_CHAIN;
14944 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
14945 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
14946 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14947 Info.offset = 0;
14948 Info.align.reset();
14949 // volatile loads with NEON intrinsics not supported
14950 Info.flags = MachineMemOperand::MOLoad;
14951 return true;
14952 }
14953 case Intrinsic::aarch64_neon_ld2lane:
14954 case Intrinsic::aarch64_neon_ld3lane:
14955 case Intrinsic::aarch64_neon_ld4lane:
14956 case Intrinsic::aarch64_neon_ld2r:
14957 case Intrinsic::aarch64_neon_ld3r:
14958 case Intrinsic::aarch64_neon_ld4r: {
14959 Info.opc = ISD::INTRINSIC_W_CHAIN;
14960 // The ldN intrinsics return a struct whose members share the same vector type.
14961 Type *RetTy = I.getType();
14962 auto *StructTy = cast<StructType>(RetTy);
14963 unsigned NumElts = StructTy->getNumElements();
14964 Type *VecTy = StructTy->getElementType(0);
14965 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
14966 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
14967 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14968 Info.offset = 0;
14969 Info.align.reset();
14970 // volatile loads with NEON intrinsics not supported
14971 Info.flags = MachineMemOperand::MOLoad;
14972 return true;
14973 }
14974 case Intrinsic::aarch64_neon_st2:
14975 case Intrinsic::aarch64_neon_st3:
14976 case Intrinsic::aarch64_neon_st4:
14977 case Intrinsic::aarch64_neon_st1x2:
14978 case Intrinsic::aarch64_neon_st1x3:
14979 case Intrinsic::aarch64_neon_st1x4: {
14980 Info.opc = ISD::INTRINSIC_VOID;
14981 unsigned NumElts = 0;
14982 for (const Value *Arg : I.args()) {
14983 Type *ArgTy = Arg->getType();
14984 if (!ArgTy->isVectorTy())
14985 break;
14986 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
14987 }
14988 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
14989 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14990 Info.offset = 0;
14991 Info.align.reset();
14992 // volatile stores with NEON intrinsics not supported
14993 Info.flags = MachineMemOperand::MOStore;
14994 return true;
14995 }
14996 case Intrinsic::aarch64_neon_st2lane:
14997 case Intrinsic::aarch64_neon_st3lane:
14998 case Intrinsic::aarch64_neon_st4lane: {
14999 Info.opc = ISD::INTRINSIC_VOID;
15000 unsigned NumElts = 0;
15001 // All of the vector arguments have the same type.
15002 Type *VecTy = I.getArgOperand(0)->getType();
15003 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
15004
15005 for (const Value *Arg : I.args()) {
15006 Type *ArgTy = Arg->getType();
15007 if (!ArgTy->isVectorTy())
15008 break;
15009 NumElts += 1;
15010 }
15011
15012 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
15013 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
15014 Info.offset = 0;
15015 Info.align.reset();
15016 // volatile stores with NEON intrinsics not supported
15017 Info.flags = MachineMemOperand::MOStore;
15018 return true;
15019 }
15020 case Intrinsic::aarch64_ldaxr:
15021 case Intrinsic::aarch64_ldxr: {
15022 Type *ValTy = I.getParamElementType(0);
15023 Info.opc = ISD::INTRINSIC_W_CHAIN;
15024 Info.memVT = MVT::getVT(ValTy);
15025 Info.ptrVal = I.getArgOperand(0);
15026 Info.offset = 0;
15027 Info.align = DL.getABITypeAlign(ValTy);
15028 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
15029 return true;
15030 }
15031 case Intrinsic::aarch64_stlxr:
15032 case Intrinsic::aarch64_stxr: {
15033 Type *ValTy = I.getParamElementType(1);
15034 Info.opc = ISD::INTRINSIC_W_CHAIN;
15035 Info.memVT = MVT::getVT(ValTy);
15036 Info.ptrVal = I.getArgOperand(1);
15037 Info.offset = 0;
15038 Info.align = DL.getABITypeAlign(ValTy);
15039 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
15040 return true;
15041 }
15042 case Intrinsic::aarch64_ldaxp:
15043 case Intrinsic::aarch64_ldxp:
15044 Info.opc = ISD::INTRINSIC_W_CHAIN;
15045 Info.memVT = MVT::i128;
15046 Info.ptrVal = I.getArgOperand(0);
15047 Info.offset = 0;
15048 Info.align = Align(16);
15049 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
15050 return true;
15051 case Intrinsic::aarch64_stlxp:
15052 case Intrinsic::aarch64_stxp:
15053 Info.opc = ISD::INTRINSIC_W_CHAIN;
15054 Info.memVT = MVT::i128;
15055 Info.ptrVal = I.getArgOperand(2);
15056 Info.offset = 0;
15057 Info.align = Align(16);
15058 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
15059 return true;
15060 case Intrinsic::aarch64_sve_ldnt1: {
15061 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
15062 Info.opc = ISD::INTRINSIC_W_CHAIN;
15063 Info.memVT = MVT::getVT(I.getType());
15064 Info.ptrVal = I.getArgOperand(1);
15065 Info.offset = 0;
15066 Info.align = DL.getABITypeAlign(ElTy);
15067 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
15068 return true;
15069 }
15070 case Intrinsic::aarch64_sve_stnt1: {
15071 Type *ElTy =
15072 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
15073 Info.opc = ISD::INTRINSIC_W_CHAIN;
15074 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
15075 Info.ptrVal = I.getArgOperand(2);
15076 Info.offset = 0;
15077 Info.align = DL.getABITypeAlign(ElTy);
15078 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
15079 return true;
15080 }
15081 case Intrinsic::aarch64_mops_memset_tag: {
15082 Value *Dst = I.getArgOperand(0);
15083 Value *Val = I.getArgOperand(1);
15084 Info.opc = ISD::INTRINSIC_W_CHAIN;
15085 Info.memVT = MVT::getVT(Val->getType());
15086 Info.ptrVal = Dst;
15087 Info.offset = 0;
15088 Info.align = I.getParamAlign(0).valueOrOne();
15089 Info.flags = MachineMemOperand::MOStore;
15090 // The size of the memory being operated on is unknown at this point
15092 return true;
15093 }
15094 default:
15095 break;
15096 }
15097
15098 return false;
15099}
15100
15101 bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
15102 ISD::LoadExtType ExtTy,
15103 EVT NewVT) const {
15104 // TODO: This may be worth removing. Check regression tests for diffs.
15105 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
15106 return false;
15107
15108 // If we're reducing the load width in order to avoid having to use an extra
15109 // instruction to do extension then it's probably a good idea.
15110 if (ExtTy != ISD::NON_EXTLOAD)
15111 return true;
15112 // Don't reduce load width if it would prevent us from combining a shift into
15113 // the offset.
15114 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
15115 assert(Mem);
15116 const SDValue &Base = Mem->getBasePtr();
15117 if (Base.getOpcode() == ISD::ADD &&
15118 Base.getOperand(1).getOpcode() == ISD::SHL &&
15119 Base.getOperand(1).hasOneUse() &&
15120 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
15121 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
15122 if (Mem->getMemoryVT().isScalableVector())
15123 return false;
15124 // The shift can be combined if it matches the size of the value being
15125 // loaded (and so reducing the width would make it not match).
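// Illustrative sketch (assumed, not taken from the source or its tests): for an
// i64 load whose address is (add x1, (shl x2, 3)), the shift amount 3 equals
// log2(8 bytes), so it can fold into "ldr x0, [x1, x2, lsl #3]"; narrowing the
// load to i32 would stop the shift from matching and lose that fold.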
15126 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
15127 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
15128 if (ShiftAmount == Log2_32(LoadBytes))
15129 return false;
15130 }
15131 // We have no reason to disallow reducing the load width, so allow it.
15132 return true;
15133}
15134
15135// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
15137 EVT VT = Extend.getValueType();
15138 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
15139 SDValue Extract = Extend.getOperand(0);
15140 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
15141 Extract = Extract.getOperand(0);
15142 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
15143 EVT VecVT = Extract.getOperand(0).getValueType();
15144 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
15145 return false;
15146 }
15147 }
15148 return true;
15149}
15150
15151 // Truncations from a 64-bit GPR to a 32-bit GPR are free.
15153 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
15154 return false;
15155 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
15156 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
15157 return NumBits1 > NumBits2;
15158}
15160 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
15161 return false;
15162 uint64_t NumBits1 = VT1.getFixedSizeInBits();
15163 uint64_t NumBits2 = VT2.getFixedSizeInBits();
15164 return NumBits1 > NumBits2;
15165}
15166
15167 /// Check if it is profitable to hoist an instruction from a then/else block into the if block.
15168 /// It is not profitable if I and its user can form an FMA instruction,
15169 /// because we prefer FMSUB/FMADD.
15171 if (I->getOpcode() != Instruction::FMul)
15172 return true;
15173
15174 if (!I->hasOneUse())
15175 return true;
15176
15177 Instruction *User = I->user_back();
15178
15179 if (!(User->getOpcode() == Instruction::FSub ||
15180 User->getOpcode() == Instruction::FAdd))
15181 return true;
15182
15184 const Function *F = I->getFunction();
15185 const DataLayout &DL = F->getParent()->getDataLayout();
15186 Type *Ty = User->getOperand(0)->getType();
15187
15188 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
15190 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
15191 Options.UnsafeFPMath));
15192}
15193
15194// All 32-bit GPR operations implicitly zero the high-half of the corresponding
15195// 64-bit GPR.
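// For example (illustrative): "add w0, w1, w2" writes zeros to bits [63:32] of
// x0, so a later zero-extension from i32 to i64 costs no extra instruction.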
15197 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
15198 return false;
15199 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
15200 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
15201 return NumBits1 == 32 && NumBits2 == 64;
15202}
15204 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
15205 return false;
15206 unsigned NumBits1 = VT1.getSizeInBits();
15207 unsigned NumBits2 = VT2.getSizeInBits();
15208 return NumBits1 == 32 && NumBits2 == 64;
15209}
15210
15212 EVT VT1 = Val.getValueType();
15213 if (isZExtFree(VT1, VT2)) {
15214 return true;
15215 }
15216
15217 if (Val.getOpcode() != ISD::LOAD)
15218 return false;
15219
15220 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
15221 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
15222 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
15223 VT1.getSizeInBits() <= 32);
15224}
15225
15226bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
15227 if (isa<FPExtInst>(Ext))
15228 return false;
15229
15230 // Vector types are not free.
15231 if (Ext->getType()->isVectorTy())
15232 return false;
15233
15234 for (const Use &U : Ext->uses()) {
15235 // The extension is free if we can fold it with a left shift in an
15236 // addressing mode or an arithmetic operation: add, sub, and cmp.
15237
15238 // Is there a shift?
15239 const Instruction *Instr = cast<Instruction>(U.getUser());
15240
15241 // Is this a constant shift?
15242 switch (Instr->getOpcode()) {
15243 case Instruction::Shl:
15244 if (!isa<ConstantInt>(Instr->getOperand(1)))
15245 return false;
15246 break;
15247 case Instruction::GetElementPtr: {
15248 gep_type_iterator GTI = gep_type_begin(Instr);
15249 auto &DL = Ext->getModule()->getDataLayout();
15250 std::advance(GTI, U.getOperandNo()-1);
15251 Type *IdxTy = GTI.getIndexedType();
15252 // This extension will end up with a shift because of the scaling factor.
15253 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
15254 // Get the shift amount based on the scaling factor:
15255 // log2(sizeof(IdxTy)) - log2(8).
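// Worked example (illustrative): for an i32 index type the shift amount is
// log2(32) - 3 = 2, which fits the 1..4 range checked below and can fold into
// an addressing mode such as "ldr w0, [x0, w1, sxtw #2]".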
15256 if (IdxTy->isScalableTy())
15257 return false;
15258 uint64_t ShiftAmt =
15259 llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
15260 3;
15261 // Is the constant foldable in the shift of the addressing mode?
15262 // I.e., shift amount is between 1 and 4 inclusive.
15263 if (ShiftAmt == 0 || ShiftAmt > 4)
15264 return false;
15265 break;
15266 }
15267 case Instruction::Trunc:
15268 // Check if this is a noop.
15269 // trunc(sext ty1 to ty2) to ty1.
15270 if (Instr->getType() == Ext->getOperand(0)->getType())
15271 continue;
15272 [[fallthrough]];
15273 default:
15274 return false;
15275 }
15276
15277 // At this point we can use the bfm family, so this extension is free
15278 // for that use.
15279 }
15280 return true;
15281}
15282
15283static bool isSplatShuffle(Value *V) {
15284 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
15285 return all_equal(Shuf->getShuffleMask());
15286 return false;
15287}
15288
15289/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
15290/// or upper half of the vector elements.
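/// A hedged example: with <8 x i16> sources, shuffle masks <0, 1, 2, 3> and
/// <4, 5, 6, 7> extract the lower and upper halves respectively, which is the
/// shape that (u|s)subl / (u|s)subl2 style patterns expect.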
15291static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
15292 bool AllowSplat = false) {
15293 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
15294 auto *FullTy = FullV->getType();
15295 auto *HalfTy = HalfV->getType();
15296 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
15297 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
15298 };
15299
15300 auto extractHalf = [](Value *FullV, Value *HalfV) {
15301 auto *FullVT = cast<FixedVectorType>(FullV->getType());
15302 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
15303 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
15304 };
15305
15306 ArrayRef<int> M1, M2;
15307 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
15308 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
15309 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
15310 return false;
15311
15312 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
15313 // it is not checked as an extract below.
15314 if (AllowSplat && isSplatShuffle(Op1))
15315 S1Op1 = nullptr;
15316 if (AllowSplat && isSplatShuffle(Op2))
15317 S2Op1 = nullptr;
15318
15319 // Check that the operands are half as wide as the result and we extract
15320 // half of the elements of the input vectors.
15321 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
15322 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
15323 return false;
15324
15325 // Check the mask extracts either the lower or upper half of vector
15326 // elements.
15327 int M1Start = 0;
15328 int M2Start = 0;
15329 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
15330 if ((S1Op1 &&
15331 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
15332 (S2Op1 &&
15333 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
15334 return false;
15335
15336 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
15337 (M2Start != 0 && M2Start != (NumElements / 2)))
15338 return false;
15339 if (S1Op1 && S2Op1 && M1Start != M2Start)
15340 return false;
15341
15342 return true;
15343}
15344
15345/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
15346/// of the vector elements.
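/// For instance (illustrative), "sext <4 x i16> %v to <4 x i32>" doubles the
/// element width from 16 to 32 bits and therefore satisfies this check.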
15347static bool areExtractExts(Value *Ext1, Value *Ext2) {
15348 auto areExtDoubled = [](Instruction *Ext) {
15349 return Ext->getType()->getScalarSizeInBits() ==
15350 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
15351 };
15352
15353 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
15354 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
15355 !areExtDoubled(cast<Instruction>(Ext1)) ||
15356 !areExtDoubled(cast<Instruction>(Ext2)))
15357 return false;
15358
15359 return true;
15360}
15361
15362/// Check if Op could be used with vmull_high_p64 intrinsic.
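/// Illustrative IR shape (assumed): "extractelement <2 x i64> %v, i64 1", i.e.
/// the high lane of a 2 x i64 vector, which is what PMULL2 consumes.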
15364 Value *VectorOperand = nullptr;
15365 ConstantInt *ElementIndex = nullptr;
15366 return match(Op, m_ExtractElt(m_Value(VectorOperand),
15367 m_ConstantInt(ElementIndex))) &&
15368 ElementIndex->getValue() == 1 &&
15369 isa<FixedVectorType>(VectorOperand->getType()) &&
15370 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
15371}
15372
15373/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
15374static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
15376}
15377
15379 // Restrict ourselves to the form CodeGenPrepare typically constructs.
15380 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
15381 if (!GEP || GEP->getNumOperands() != 2)
15382 return false;
15383
15384 Value *Base = GEP->getOperand(0);
15385 Value *Offsets = GEP->getOperand(1);
15386
15387 // We only care about scalar_base+vector_offsets.
15388 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
15389 return false;
15390
15391 // Sink extends that would allow us to use 32-bit offset vectors.
15392 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
15393 auto *OffsetsInst = cast<Instruction>(Offsets);
15394 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
15395 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
15396 Ops.push_back(&GEP->getOperandUse(1));
15397 }
15398
15399 // Sink the GEP.
15400 return true;
15401}
15402
15403 /// We want to sink the following cases:
15404/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale
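/// A minimal IR sketch of the first form (illustrative; names are made up):
///   %vs = call i64 @llvm.vscale.i64()
///   %sc = shl i64 %vs, 4
///   %p  = getelementptr i8, ptr %base, i64 %sc
/// Keeping the vscale next to its user lets isel pick addvl/inc*-style forms.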
15406 if (match(Op, m_VScale()))
15407 return true;
15408 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
15410 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
15411 return true;
15412 }
15413 return false;
15414}
15415
15416/// Check if sinking \p I's operands to I's basic block is profitable, because
15417/// the operands can be folded into a target instruction, e.g.
15418/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
15420 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
15421 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
15422 switch (II->getIntrinsicID()) {
15423 case Intrinsic::aarch64_neon_smull:
15424 case Intrinsic::aarch64_neon_umull:
15425 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
15426 /*AllowSplat=*/true)) {
15427 Ops.push_back(&II->getOperandUse(0));
15428 Ops.push_back(&II->getOperandUse(1));
15429 return true;
15430 }
15431 [[fallthrough]];
15432
15433 case Intrinsic::fma:
15434 if (isa<VectorType>(I->getType()) &&
15435 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
15436 !Subtarget->hasFullFP16())
15437 return false;
15438 [[fallthrough]];
15439 case Intrinsic::aarch64_neon_sqdmull:
15440 case Intrinsic::aarch64_neon_sqdmulh:
15441 case Intrinsic::aarch64_neon_sqrdmulh:
15442 // Sink splats for index lane variants
15443 if (isSplatShuffle(II->getOperand(0)))
15444 Ops.push_back(&II->getOperandUse(0));
15445 if (isSplatShuffle(II->getOperand(1)))
15446 Ops.push_back(&II->getOperandUse(1));
15447 return !Ops.empty();
15448 case Intrinsic::aarch64_neon_fmlal:
15449 case Intrinsic::aarch64_neon_fmlal2:
15450 case Intrinsic::aarch64_neon_fmlsl:
15451 case Intrinsic::aarch64_neon_fmlsl2:
15452 // Sink splats for index lane variants
15453 if (isSplatShuffle(II->getOperand(1)))
15454 Ops.push_back(&II->getOperandUse(1));
15455 if (isSplatShuffle(II->getOperand(2)))
15456 Ops.push_back(&II->getOperandUse(2));
15457 return !Ops.empty();
15458 case Intrinsic::aarch64_sve_ptest_first:
15459 case Intrinsic::aarch64_sve_ptest_last:
15460 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
15461 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
15462 Ops.push_back(&II->getOperandUse(0));
15463 return !Ops.empty();
15464 case Intrinsic::aarch64_sme_write_horiz:
15465 case Intrinsic::aarch64_sme_write_vert:
15466 case Intrinsic::aarch64_sme_writeq_horiz:
15467 case Intrinsic::aarch64_sme_writeq_vert: {
15468 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
15469 if (!Idx || Idx->getOpcode() != Instruction::Add)
15470 return false;
15471 Ops.push_back(&II->getOperandUse(1));
15472 return true;
15473 }
15474 case Intrinsic::aarch64_sme_read_horiz:
15475 case Intrinsic::aarch64_sme_read_vert:
15476 case Intrinsic::aarch64_sme_readq_horiz:
15477 case Intrinsic::aarch64_sme_readq_vert:
15478 case Intrinsic::aarch64_sme_ld1b_vert:
15479 case Intrinsic::aarch64_sme_ld1h_vert:
15480 case Intrinsic::aarch64_sme_ld1w_vert:
15481 case Intrinsic::aarch64_sme_ld1d_vert:
15482 case Intrinsic::aarch64_sme_ld1q_vert:
15483 case Intrinsic::aarch64_sme_st1b_vert:
15484 case Intrinsic::aarch64_sme_st1h_vert:
15485 case Intrinsic::aarch64_sme_st1w_vert:
15486 case Intrinsic::aarch64_sme_st1d_vert:
15487 case Intrinsic::aarch64_sme_st1q_vert:
15488 case Intrinsic::aarch64_sme_ld1b_horiz:
15489 case Intrinsic::aarch64_sme_ld1h_horiz:
15490 case Intrinsic::aarch64_sme_ld1w_horiz:
15491 case Intrinsic::aarch64_sme_ld1d_horiz:
15492 case Intrinsic::aarch64_sme_ld1q_horiz:
15493 case Intrinsic::aarch64_sme_st1b_horiz:
15494 case Intrinsic::aarch64_sme_st1h_horiz:
15495 case Intrinsic::aarch64_sme_st1w_horiz:
15496 case Intrinsic::aarch64_sme_st1d_horiz:
15497 case Intrinsic::aarch64_sme_st1q_horiz: {
15498 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
15499 if (!Idx || Idx->getOpcode() != Instruction::Add)
15500 return false;
15501 Ops.push_back(&II->getOperandUse(3));
15502 return true;
15503 }
15504 case Intrinsic::aarch64_neon_pmull:
15505 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
15506 return false;
15507 Ops.push_back(&II->getOperandUse(0));
15508 Ops.push_back(&II->getOperandUse(1));
15509 return true;
15510 case Intrinsic::aarch64_neon_pmull64:
15511 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
15512 II->getArgOperand(1)))
15513 return false;
15514 Ops.push_back(&II->getArgOperandUse(0));
15515 Ops.push_back(&II->getArgOperandUse(1));
15516 return true;
15517 case Intrinsic::masked_gather:
15518 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
15519 return false;
15520 Ops.push_back(&II->getArgOperandUse(0));
15521 return true;
15522 case Intrinsic::masked_scatter:
15523 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
15524 return false;
15525 Ops.push_back(&II->getArgOperandUse(1));
15526 return true;
15527 default:
15528 return false;
15529 }
15530 }
15531
15532 // Sink vscales closer to uses for better isel
15533 switch (I->getOpcode()) {
15534 case Instruction::GetElementPtr:
15535 case Instruction::Add:
15536 case Instruction::Sub:
15537 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
15538 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
15539 Ops.push_back(&I->getOperandUse(Op));
15540 return true;
15541 }
15542 }
15543 break;
15544 default:
15545 break;
15546 }
15547
15548 if (!I->getType()->isVectorTy())
15549 return false;
15550
15551 switch (I->getOpcode()) {
15552 case Instruction::Sub:
15553 case Instruction::Add: {
15554 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
15555 return false;
15556
15557 // If the exts' operands extract either the lower or upper elements, we
15558 // can sink them too.
15559 auto Ext1 = cast<Instruction>(I->getOperand(0));
15560 auto Ext2 = cast<Instruction>(I->getOperand(1));
15561 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
15562 Ops.push_back(&Ext1->getOperandUse(0));
15563 Ops.push_back(&Ext2->getOperandUse(0));
15564 }
15565
15566 Ops.push_back(&I->getOperandUse(0));
15567 Ops.push_back(&I->getOperandUse(1));
15568
15569 return true;
15570 }
15571 case Instruction::Or: {
15572 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
15573 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
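// Rough IR shape being matched (illustrative):
//   %not = xor <16 x i8> %mask, splat (i8 -1)
//   %t0  = and <16 x i8> %mask, %a
//   %t1  = and <16 x i8> %not, %b
//   %or  = or  <16 x i8> %t0, %t1
// With all pieces in one block this can select to a single NEON BSL.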
15574 if (Subtarget->hasNEON()) {
15575 Instruction *OtherAnd, *IA, *IB;
15576 Value *MaskValue;
15577 // MainAnd refers to And instruction that has 'Not' as one of its operands
15578 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
15579 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
15580 m_Instruction(IA)))))) {
15581 if (match(OtherAnd,
15582 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
15583 Instruction *MainAnd = I->getOperand(0) == OtherAnd
15584 ? cast<Instruction>(I->getOperand(1))
15585 : cast<Instruction>(I->getOperand(0));
15586
15587 // Both Ands should be in the same basic block as the Or.
15588 if (I->getParent() != MainAnd->getParent() ||
15589 I->getParent() != OtherAnd->getParent())
15590 return false;
15591
15592 // Non-mask operands of both Ands should also be in the same basic block.
15593 if (I->getParent() != IA->getParent() ||
15594 I->getParent() != IB->getParent())
15595 return false;
15596
15597 Ops.push_back(&MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
15598 Ops.push_back(&I->getOperandUse(0));
15599 Ops.push_back(&I->getOperandUse(1));
15600
15601 return true;
15602 }
15603 }
15604 }
15605
15606 return false;
15607 }
15608 case Instruction::Mul: {
15609 int NumZExts = 0, NumSExts = 0;
15610 for (auto &Op : I->operands()) {
15611 // Make sure we are not already sinking this operand
15612 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
15613 continue;
15614
15615 if (match(&Op, m_SExt(m_Value()))) {
15616 NumSExts++;
15617 continue;
15618 } else if (match(&Op, m_ZExt(m_Value()))) {
15619 NumZExts++;
15620 continue;
15621 }
15622
15623 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
15624
15625 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
15626 // operand and the s/zext can help create indexed s/umull. This is
15627 // especially useful to prevent an i64 mul from being scalarized.
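// For instance (illustrative), sinking the splat next to the multiply allows
// selecting an indexed form such as "smull v0.4s, v1.4h, v2.h[0]" instead of
// materializing the splatted operand separately.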
15628 if (Shuffle && isSplatShuffle(Shuffle) &&
15629 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
15630 Ops.push_back(&Shuffle->getOperandUse(0));
15631 Ops.push_back(&Op);
15632 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
15633 NumSExts++;
15634 else
15635 NumZExts++;
15636 continue;
15637 }
15638
15639 if (!Shuffle)
15640 continue;
15641
15642 Value *ShuffleOperand = Shuffle->getOperand(0);
15643 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
15644 if (!Insert)
15645 continue;
15646
15647 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
15648 if (!OperandInstr)
15649 continue;
15650
15651 ConstantInt *ElementConstant =
15652 dyn_cast<ConstantInt>(Insert->getOperand(2));
15653 // Check that the insertelement is inserting into element 0
15654 if (!ElementConstant || !ElementConstant->isZero())
15655 continue;
15656
15657 unsigned Opcode = OperandInstr->getOpcode();
15658 if (Opcode == Instruction::SExt)
15659 NumSExts++;
15660 else if (Opcode == Instruction::ZExt)
15661 NumZExts++;
15662 else {
15663 // If we find that the top bits are known 0, then we can sink and allow
15664 // the backend to generate a umull.
15665 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
15666 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
15667 const DataLayout &DL = I->getFunction()->getParent()->getDataLayout();
15668 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
15669 continue;
15670 NumZExts++;
15671 }
15672
15673 Ops.push_back(&Shuffle->getOperandUse(0));
15674 Ops.push_back(&Op);
15675 }
15676
15677 // It is only profitable to sink if we found two extends of the same type.
15678 return !Ops.empty() && (NumSExts == 2 || NumZExts == 2);
15679 }
15680 default:
15681 return false;
15682 }
15683 return false;
15684}
15685
15687 bool IsLittleEndian) {
15688 Value *Op = ZExt->getOperand(0);
15689 auto *SrcTy = cast<FixedVectorType>(Op->getType());
15690 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
15691 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
15692 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth >= 64)
15693 return false;
15694
15695 assert(DstWidth % SrcWidth == 0 &&
15696 "TBL lowering is not supported for a ZExt instruction with this "
15697 "source & destination element type.");
15698 unsigned ZExtFactor = DstWidth / SrcWidth;
15699 unsigned NumElts = SrcTy->getNumElements();
15700 IRBuilder<> Builder(ZExt);
15701 SmallVector<int> Mask;
15702 // Create a mask that selects <0,...,Op[i]> for each lane of the destination
15703 // vector to replace the original ZExt. This can later be lowered to a set of
15704 // tbl instructions.
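// Worked example (illustrative): a zext from <4 x i8> to <4 x i32> on
// little-endian gives ZExtFactor = 4 and the mask
//   <0, 4, 4, 4, 1, 4, 4, 4, 2, 4, 4, 4, 3, 4, 4, 4>
// where index 4 selects the zero element inserted into the second operand.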
15705 for (unsigned i = 0; i < NumElts * ZExtFactor; i++) {
15706 if (IsLittleEndian) {
15707 if (i % ZExtFactor == 0)
15708 Mask.push_back(i / ZExtFactor);
15709 else
15710 Mask.push_back(NumElts);
15711 } else {
15712 if ((i + 1) % ZExtFactor == 0)
15713 Mask.push_back((i - ZExtFactor + 1) / ZExtFactor);
15714 else
15715 Mask.push_back(NumElts);
15716 }
15717 }
15718
15719 auto *FirstEltZero = Builder.CreateInsertElement(
15720 PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
15721 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
15722 Result = Builder.CreateBitCast(Result, DstTy);
15723 if (DstTy != ZExt->getType())
15724 Result = Builder.CreateZExt(Result, ZExt->getType());
15725 ZExt->replaceAllUsesWith(Result);
15726 ZExt->eraseFromParent();
15727 return true;
15728}
15729
15730static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
15731 IRBuilder<> Builder(TI);
15733 int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
15734 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
15735 auto *DstTy = cast<FixedVectorType>(TI->getType());
15736 assert(SrcTy->getElementType()->isIntegerTy() &&
15737 "Non-integer type source vector element is not supported");
15738 assert(DstTy->getElementType()->isIntegerTy(8) &&
15739 "Unsupported destination vector element type");
15740 unsigned SrcElemTySz =
15741 cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
15742 unsigned DstElemTySz =
15743 cast<IntegerType>(DstTy->getElementType())->getBitWidth();
15744 assert((SrcElemTySz % DstElemTySz == 0) &&
15745 "Cannot lower truncate to tbl instructions for a source element size "
15746 "that is not divisible by the destination element size");
15747 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
15748 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
15749 "Unsupported source vector element type size");
15750 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
15751
15752 // Create a mask to choose every nth byte from the source vector table of
15753 // bytes to create the truncated destination vector, where 'n' is the truncate
15754 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose bytes
15755 // 0, 8, 16, ..., (Y-1)*8 for the little-endian format.
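// Concrete example (illustrative): truncating <8 x i32> to <8 x i8> on
// little-endian has TruncFactor = 4, so the mask bytes are 0, 4, 8, ..., 28,
// followed by 255 (out of range) for the unused byte slots.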
15757 for (int Itr = 0; Itr < 16; Itr++) {
15758 if (Itr < NumElements)
15759 MaskConst.push_back(Builder.getInt8(
15760 IsLittleEndian ? Itr * TruncFactor
15761 : Itr * TruncFactor + (TruncFactor - 1)));
15762 else
15763 MaskConst.push_back(Builder.getInt8(255));
15764 }
15765
15766 int MaxTblSz = 128 * 4;
15767 int MaxSrcSz = SrcElemTySz * NumElements;
15768 int ElemsPerTbl =
15769 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
15770 assert(ElemsPerTbl <= 16 &&
15771 "Maximum elements selected using TBL instruction cannot exceed 16!");
15772
15773 int ShuffleCount = 128 / SrcElemTySz;
15774 SmallVector<int> ShuffleLanes;
15775 for (int i = 0; i < ShuffleCount; ++i)
15776 ShuffleLanes.push_back(i);
15777
15778 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
15779 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
15780 // call TBL & save the result in a vector of TBL results for combining later.
15782 while (ShuffleLanes.back() < NumElements) {
15783 Parts.push_back(Builder.CreateBitCast(
15784 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
15785
15786 if (Parts.size() == 4) {
15788 Intrinsic::aarch64_neon_tbl4, VecTy);
15789 Parts.push_back(ConstantVector::get(MaskConst));
15790 Results.push_back(Builder.CreateCall(F, Parts));
15791 Parts.clear();
15792 }
15793
15794 for (int i = 0; i < ShuffleCount; ++i)
15795 ShuffleLanes[i] += ShuffleCount;
15796 }
15797
15798 assert((Parts.empty() || Results.empty()) &&
15799 "Lowering trunc for vectors requiring different TBL instructions is "
15800 "not supported!");
15801 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
15802 // registers
15803 if (!Parts.empty()) {
15804 Intrinsic::ID TblID;
15805 switch (Parts.size()) {
15806 case 1:
15807 TblID = Intrinsic::aarch64_neon_tbl1;
15808 break;
15809 case 2:
15810 TblID = Intrinsic::aarch64_neon_tbl2;
15811 break;
15812 case 3:
15813 TblID = Intrinsic::aarch64_neon_tbl3;
15814 break;
15815 }
15816
15817 auto *F = Intrinsic::getDeclaration(TI->getModule(), TblID, VecTy);
15818 Parts.push_back(ConstantVector::get(MaskConst));
15819 Results.push_back(Builder.CreateCall(F, Parts));
15820 }
15821
15822 // Extract the destination vector from TBL result(s) after combining them
15823 // where applicable. Currently, at most two TBLs are supported.
15824 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
15825 "more than 2 tbl instructions!");
15826 Value *FinalResult = Results[0];
15827 if (Results.size() == 1) {
15828 if (ElemsPerTbl < 16) {
15829 SmallVector<int> FinalMask(ElemsPerTbl);
15830 std::iota(FinalMask.begin(), FinalMask.end(), 0);
15831 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
15832 }
15833 } else {
15834 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
15835 if (ElemsPerTbl < 16) {
15836 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
15837 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
15838 } else {
15839 std::iota(FinalMask.begin(), FinalMask.end(), 0);
15840 }
15841 FinalResult =
15842 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
15843 }
15844
15845 TI->replaceAllUsesWith(FinalResult);
15846 TI->eraseFromParent();
15847}
15848
15850 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
15851 // shuffle_vector instructions are serialized when targeting SVE,
15852 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
15853 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
15854 return false;
15855
15856 // Try to optimize conversions using tbl. This requires materializing constant
15857 // index vectors, which can increase code size and add loads. Skip the
15858 // transform unless the conversion is in a loop block guaranteed to execute
15859 // and we are not optimizing for size.
15860 Function *F = I->getParent()->getParent();
15861 if (!L || L->getHeader() != I->getParent() || F->hasMinSize() ||
15862 F->hasOptSize())
15863 return false;
15864
15865 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
15866 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
15867 if (!SrcTy || !DstTy)
15868 return false;
15869
15870 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
15871 // lowered to tbl instructions to insert the original i8 elements
15872 // into i8x lanes. This is enabled for cases where it is beneficial.
15873 auto *ZExt = dyn_cast<ZExtInst>(I);
15874 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
15875 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
15876 if (DstWidth % 8 != 0)
15877 return false;
15878
15879 auto *TruncDstType =
15880 cast<FixedVectorType>(VectorType::getTruncatedElementVectorType(DstTy));
15881 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
15882 // the remaining ZExt folded into the user, don't use tbl lowering.
15883 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
15884 if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
15887 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
15888 return false;
15889
15890 DstTy = TruncDstType;
15891 }
15892
15893 return createTblShuffleForZExt(ZExt, DstTy, Subtarget->isLittleEndian());
15894 }
15895
15896 auto *UIToFP = dyn_cast<UIToFPInst>(I);
15897 if (UIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
15898 DstTy->getElementType()->isFloatTy()) {
15899 IRBuilder<> Builder(I);
15900 auto *ZExt = cast<ZExtInst>(
15901 Builder.CreateZExt(I->getOperand(0), VectorType::getInteger(DstTy)));
15902 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
15903 I->replaceAllUsesWith(UI);
15904 I->eraseFromParent();
15905 return createTblShuffleForZExt(ZExt, cast<FixedVectorType>(ZExt->getType()),
15906 Subtarget->isLittleEndian());
15907 }
15908
15909 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
15910 // followed by a truncate lowered to using tbl.4.
15911 auto *FPToUI = dyn_cast<FPToUIInst>(I);
15912 if (FPToUI &&
15913 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
15914 SrcTy->getElementType()->isFloatTy() &&
15915 DstTy->getElementType()->isIntegerTy(8)) {
15916 IRBuilder<> Builder(I);
15917 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
15918 VectorType::getInteger(SrcTy));
15919 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
15920 I->replaceAllUsesWith(TruncI);
15921 I->eraseFromParent();
15922 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
15923 return true;
15924 }
15925
15926 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
15927 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
15928 // per lane of the input that is represented using 1,2,3 or 4 128-bit table
15929 // registers
15930 auto *TI = dyn_cast<TruncInst>(I);
15931 if (TI && DstTy->getElementType()->isIntegerTy(8) &&
15932 ((SrcTy->getElementType()->isIntegerTy(32) ||
15933 SrcTy->getElementType()->isIntegerTy(64)) &&
15934 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
15935 createTblForTrunc(TI, Subtarget->isLittleEndian());
15936 return true;
15937 }
15938
15939 return false;
15940}
15941
15943 Align &RequiredAligment) const {
15944 if (!LoadedType.isSimple() ||
15945 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
15946 return false;
15947 // Cyclone supports unaligned accesses.
15948 RequiredAligment = Align(1);
15949 unsigned NumBits = LoadedType.getSizeInBits();
15950 return NumBits == 32 || NumBits == 64;
15951}
15952
15953/// A helper function for determining the number of interleaved accesses we
15954/// will generate when lowering accesses of the given type.
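/// For example (illustrative), a <16 x i32> access is 512 bits, so with 128-bit
/// NEON vectors this returns (16 * 32 + 127) / 128 = 4 interleaved accesses.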
15956 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
15957 unsigned VecSize = 128;
15958 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
15959 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
15960 if (UseScalable)
15961 VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
15962 return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
15963}
15964
15967 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
15968 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
15969 return MOStridedAccess;
15971}
15972
15974 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
15975 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
15976 auto EC = VecTy->getElementCount();
15977 unsigned MinElts = EC.getKnownMinValue();
15978
15979 UseScalable = false;
15980
15981 if (!VecTy->isScalableTy() && !Subtarget->hasNEON())
15982 return false;
15983
15984 if (VecTy->isScalableTy() && !Subtarget->hasSVEorSME())
15985 return false;
15986
15987 // Ensure that the predicate for this number of elements is available.
15988 if (Subtarget->hasSVE() && !getSVEPredPatternFromNumElements(MinElts))
15989 return false;
15990
15991 // Ensure the number of vector elements is greater than 1.
15992 if (MinElts < 2)
15993 return false;
15994
15995 // Ensure the element type is legal.
15996 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
15997 return false;
15998
15999 if (EC.isScalable()) {
16000 UseScalable = true;
16001 return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
16002 }
16003
16004 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
16005 if (!Subtarget->isNeonAvailable() ||
16006 (Subtarget->useSVEForFixedLengthVectors() &&
16007 (VecSize % Subtarget->getMinSVEVectorSizeInBits() == 0 ||
16008 (VecSize < Subtarget->getMinSVEVectorSizeInBits() &&
16009 isPowerOf2_32(MinElts) && VecSize > 128)))) {
16010 UseScalable = true;
16011 return true;
16012 }
16013
16014 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
16015 // 128 will be split into multiple interleaved accesses.
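// For instance (illustrative): a <4 x i16> sub-vector (64 bits) and an
// <8 x i32> sub-vector (256 bits, split into two accesses) pass this check,
// while a <3 x i32> sub-vector (96 bits) fails this final size check.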
16016 return VecSize == 64 || VecSize % 128 == 0;
16017}
16018
16020 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
16021 return ScalableVectorType::get(VTy->getElementType(), 2);
16022
16023 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
16024 return ScalableVectorType::get(VTy->getElementType(), 4);
16025
16026 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
16027 return ScalableVectorType::get(VTy->getElementType(), 8);
16028
16029 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
16030 return ScalableVectorType::get(VTy->getElementType(), 8);
16031
16032 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
16033 return ScalableVectorType::get(VTy->getElementType(), 2);
16034
16035 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
16036 return ScalableVectorType::get(VTy->getElementType(), 4);
16037
16038 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
16039 return ScalableVectorType::get(VTy->getElementType(), 8);
16040
16041 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
16042 return ScalableVectorType::get(VTy->getElementType(), 16);
16043
16044 llvm_unreachable("Cannot handle input vector type");
16045}
16046
16047static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
16048 bool Scalable, Type *LDVTy,
16049 Type *PtrTy) {
16050 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
16051 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
16052 Intrinsic::aarch64_sve_ld3_sret,
16053 Intrinsic::aarch64_sve_ld4_sret};
16054 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
16055 Intrinsic::aarch64_neon_ld3,
16056 Intrinsic::aarch64_neon_ld4};
16057 if (Scalable)
16058 return Intrinsic::getDeclaration(M, SVELoads[Factor - 2], {LDVTy});
16059
16060 return Intrinsic::getDeclaration(M, NEONLoads[Factor - 2], {LDVTy, PtrTy});
16061}
16062
16063static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
16064 bool Scalable, Type *STVTy,
16065 Type *PtrTy) {
16066 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
16067 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
16068 Intrinsic::aarch64_sve_st3,
16069 Intrinsic::aarch64_sve_st4};
16070 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
16071 Intrinsic::aarch64_neon_st3,
16072 Intrinsic::aarch64_neon_st4};
16073 if (Scalable)
16074 return Intrinsic::getDeclaration(M, SVEStores[Factor - 2], {STVTy});
16075
16076 return Intrinsic::getDeclaration(M, NEONStores[Factor - 2], {STVTy, PtrTy});
16077}
16078
16079/// Lower an interleaved load into a ldN intrinsic.
16080///
16081/// E.g. Lower an interleaved load (Factor = 2):
16082/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
16083/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
16084/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
16085///
16086/// Into:
16087/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
16088/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
16089/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
16092 ArrayRef<unsigned> Indices, unsigned Factor) const {
16093 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16094 "Invalid interleave factor");
16095 assert(!Shuffles.empty() && "Empty shufflevector input");
16096 assert(Shuffles.size() == Indices.size() &&
16097 "Unmatched number of shufflevectors and indices");
16098
16099 const DataLayout &DL = LI->getModule()->getDataLayout();
16100
16101 VectorType *VTy = Shuffles[0]->getType();
16102
16103 // Skip if we do not have NEON and skip illegal vector types. We can
16104 // "legalize" wide vector types into multiple interleaved accesses as long as
16105 // the vector types are divisible by 128.
16106 bool UseScalable;
16107 if (!Subtarget->hasNEON() ||
16108 !isLegalInterleavedAccessType(VTy, DL, UseScalable))
16109 return false;
16110
16111 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
16112
16113 auto *FVTy = cast<FixedVectorType>(VTy);
16114
16115 // A pointer vector can not be the return type of the ldN intrinsics. Need to
16116 // load integer vectors first and then convert to pointer vectors.
16117 Type *EltTy = FVTy->getElementType();
16118 if (EltTy->isPointerTy())
16119 FVTy =
16120 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
16121
16122 // If we're going to generate more than one load, reset the sub-vector type
16123 // to something legal.
16124 FVTy = FixedVectorType::get(FVTy->getElementType(),
16125 FVTy->getNumElements() / NumLoads);
16126
16127 auto *LDVTy =
16128 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
16129
16130 IRBuilder<> Builder(LI);
16131
16132 // The base address of the load.
16133 Value *BaseAddr = LI->getPointerOperand();
16134
16135 Type *PtrTy = LI->getPointerOperandType();
16136 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
16137 LDVTy->getElementCount());
16138
16139 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
16140 UseScalable, LDVTy, PtrTy);
16141
16142 // Holds sub-vectors extracted from the load intrinsic return values. The
16143 // sub-vectors are associated with the shufflevector instructions they will
16144 // replace.
16146
16147 Value *PTrue = nullptr;
16148 if (UseScalable) {
16149 std::optional<unsigned> PgPattern =
16150 getSVEPredPatternFromNumElements(FVTy->getNumElements());
16151 if (Subtarget->getMinSVEVectorSizeInBits() ==
16152 Subtarget->getMaxSVEVectorSizeInBits() &&
16153 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
16154 PgPattern = AArch64SVEPredPattern::all;
16155
16156 auto *PTruePat =
16157 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
16158 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
16159 {PTruePat});
16160 }
16161
16162 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
16163
16164 // If we're generating more than one load, compute the base address of
16165 // subsequent loads as an offset from the previous.
16166 if (LoadCount > 0)
16167 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
16168 FVTy->getNumElements() * Factor);
16169
16170 CallInst *LdN;
16171 if (UseScalable)
16172 LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
16173 else
16174 LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
16175
16176 // Extract and store the sub-vectors returned by the load intrinsic.
16177 for (unsigned i = 0; i < Shuffles.size(); i++) {
16178 ShuffleVectorInst *SVI = Shuffles[i];
16179 unsigned Index = Indices[i];
16180
16181 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
16182
16183 if (UseScalable)
16184 SubVec = Builder.CreateExtractVector(
16185 FVTy, SubVec,
16186 ConstantInt::get(Type::getInt64Ty(VTy->getContext()), 0));
16187
16188 // Convert the integer vector to pointer vector if the element is pointer.
16189 if (EltTy->isPointerTy())
16190 SubVec = Builder.CreateIntToPtr(
16192 FVTy->getNumElements()));
16193
16194 SubVecs[SVI].push_back(SubVec);
16195 }
16196 }
16197
16198 // Replace uses of the shufflevector instructions with the sub-vectors
16199 // returned by the load intrinsic. If a shufflevector instruction is
16200 // associated with more than one sub-vector, those sub-vectors will be
16201 // concatenated into a single wide vector.
16202 for (ShuffleVectorInst *SVI : Shuffles) {
16203 auto &SubVec = SubVecs[SVI];
16204 auto *WideVec =
16205 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
16206 SVI->replaceAllUsesWith(WideVec);
16207 }
16208
16209 return true;
16210}
16211
16212template <typename Iter>
16213bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
16214 int MaxLookupDist = 20;
16215 unsigned IdxWidth = DL.getIndexSizeInBits(0);
16216 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
16217 const Value *PtrA1 =
16218 Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
16219
16220 while (++It != End) {
16221 if (It->isDebugOrPseudoInst())
16222 continue;
16223 if (MaxLookupDist-- == 0)
16224 break;
16225 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
16226 const Value *PtrB1 =
16227 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
16228 DL, OffsetB);
16229 if (PtrA1 == PtrB1 &&
16230 (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
16231 .abs() == 16)
16232 return true;
16233 }
16234 }
16235
16236 return false;
16237}
16238
16239/// Lower an interleaved store into a stN intrinsic.
16240///
16241/// E.g. Lower an interleaved store (Factor = 3):
16242/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
16243/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
16244/// store <12 x i32> %i.vec, <12 x i32>* %ptr
16245///
16246/// Into:
16247/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
16248/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
16249/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
16250/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
16251///
16252/// Note that the new shufflevectors will be removed and we'll only generate one
16253/// st3 instruction in CodeGen.
16254///
16255/// Example for a more general valid mask (Factor 3). Lower:
16256/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
16257/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
16258/// store <12 x i32> %i.vec, <12 x i32>* %ptr
16259///
16260/// Into:
16261/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
16262/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
16263/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
16264/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
16266 ShuffleVectorInst *SVI,
16267 unsigned Factor) const {
16268
16269 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16270 "Invalid interleave factor");
16271
16272 auto *VecTy = cast<FixedVectorType>(SVI->getType());
16273 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
16274
16275 unsigned LaneLen = VecTy->getNumElements() / Factor;
16276 Type *EltTy = VecTy->getElementType();
16277 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
16278
16279 const DataLayout &DL = SI->getModule()->getDataLayout();
16280 bool UseScalable;
16281
16282 // Skip if we do not have NEON and skip illegal vector types. We can
16283 // "legalize" wide vector types into multiple interleaved accesses as long as
16284 // the vector types are divisible by 128.
16285 if (!Subtarget->hasNEON() ||
16286 !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
16287 return false;
16288
16289 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
16290
16291 Value *Op0 = SVI->getOperand(0);
16292 Value *Op1 = SVI->getOperand(1);
16293 IRBuilder<> Builder(SI);
16294
16295 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
16296 // vectors to integer vectors.
16297 if (EltTy->isPointerTy()) {
16298 Type *IntTy = DL.getIntPtrType(EltTy);
16299 unsigned NumOpElts =
16300 cast<FixedVectorType>(Op0->getType())->getNumElements();
16301
16302 // Convert to the corresponding integer vector.
16303 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
16304 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
16305 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
16306
16307 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
16308 }
16309
16310 // If we're going to generate more than one store, reset the lane length
16311 // and sub-vector type to something legal.
16312 LaneLen /= NumStores;
16313 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
16314
16315 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
16316 : SubVecTy;
16317
16318 // The base address of the store.
16319 Value *BaseAddr = SI->getPointerOperand();
16320
16321 auto Mask = SVI->getShuffleMask();
16322
16323 // Sanity check: bail out if none of the mask indices are in range.
16324 // If the mask is all `poison`, `Mask` will be a vector of -1s,
16325 // and an out-of-bounds read would happen later.
16326 if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
16327 return false;
16328 }
16329 // A 64-bit st2 which does not start at element 0 will involve adding extra
16330 // ext elements, making the st2 unprofitable, and if there is a nearby store
16331 // that points to BaseAddr+16 or BaseAddr-16 then it can be better left as a
16332 // zip;ldp pair which has higher throughput.
16333 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
16334 (Mask[0] != 0 ||
16335 hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
16336 DL) ||
16337 hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
16338 BaseAddr, DL)))
16339 return false;
16340
16341 Type *PtrTy = SI->getPointerOperandType();
16342 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
16343 STVTy->getElementCount());
16344
16345 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
16346 UseScalable, STVTy, PtrTy);
16347
16348 Value *PTrue = nullptr;
16349 if (UseScalable) {
16350 std::optional<unsigned> PgPattern =
16351 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
16352 if (Subtarget->getMinSVEVectorSizeInBits() ==
16353 Subtarget->getMaxSVEVectorSizeInBits() &&
16354 Subtarget->getMinSVEVectorSizeInBits() ==
16355 DL.getTypeSizeInBits(SubVecTy))
16356 PgPattern = AArch64SVEPredPattern::all;
16357
16358 auto *PTruePat =
16359 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
16360 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
16361 {PTruePat});
16362 }
16363
16364 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
16365
16367
16368 // Split the shufflevector operands into sub vectors for the new stN call.
16369 for (unsigned i = 0; i < Factor; i++) {
16370 Value *Shuffle;
16371 unsigned IdxI = StoreCount * LaneLen * Factor + i;
16372 if (Mask[IdxI] >= 0) {
16373 Shuffle = Builder.CreateShuffleVector(
16374 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
16375 } else {
16376 unsigned StartMask = 0;
16377 for (unsigned j = 1; j < LaneLen; j++) {
16378 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
16379 if (Mask[IdxJ] >= 0) {
16380 StartMask = Mask[IdxJ] - j;
16381 break;
16382 }
16383 }
16384 // Note: Filling undef gaps with random elements is ok, since
16385 // those elements were being written anyway (with undefs).
16386 // In the case of all undefs we default to using elements from 0.
16387 // Note: StartMask cannot be negative; it's checked in
16388 // isReInterleaveMask.
16389 Shuffle = Builder.CreateShuffleVector(
16390 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
16391 }
16392
16393 if (UseScalable)
16394 Shuffle = Builder.CreateInsertVector(
16395 STVTy, UndefValue::get(STVTy), Shuffle,
16396 ConstantInt::get(Type::getInt64Ty(STVTy->getContext()), 0));
16397
16398 Ops.push_back(Shuffle);
16399 }
16400
16401 if (UseScalable)
16402 Ops.push_back(PTrue);
16403
16404 // If we're generating more than one store, compute the base address of
16405 // subsequent stores as an offset from the previous one.
16406 if (StoreCount > 0)
16407 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
16408 BaseAddr, LaneLen * Factor);
16409
16410 Ops.push_back(BaseAddr);
16411 Builder.CreateCall(StNFunc, Ops);
16412 }
16413 return true;
16414}
16415
16417 IntrinsicInst *DI, LoadInst *LI) const {
16418 // Only deinterleave2 supported at present.
16419 if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
16420 return false;
16421
16422 // Only a factor of 2 supported at present.
16423 const unsigned Factor = 2;
16424
16425 VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
16426 const DataLayout &DL = DI->getModule()->getDataLayout();
16427 bool UseScalable;
16428 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
16429 return false;
16430
16431 // TODO: Add support for using SVE instructions with fixed types later, using
16432 // the code from lowerInterleavedLoad to obtain the correct container type.
16433 if (UseScalable && !VTy->isScalableTy())
16434 return false;
16435
16436 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
16437
16438 VectorType *LdTy =
16440 VTy->getElementCount().divideCoefficientBy(NumLoads));
16441
16442 Type *PtrTy = LI->getPointerOperandType();
16443 Function *LdNFunc = getStructuredLoadFunction(DI->getModule(), Factor,
16444 UseScalable, LdTy, PtrTy);
16445
16446 IRBuilder<> Builder(LI);
16447
16448 Value *Pred = nullptr;
16449 if (UseScalable)
16450 Pred =
16451 Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
16452
16453 Value *BaseAddr = LI->getPointerOperand();
16454 Value *Result;
16455 if (NumLoads > 1) {
16456 Value *Left = PoisonValue::get(VTy);
16458
16459 for (unsigned I = 0; I < NumLoads; ++I) {
16460 Value *Offset = Builder.getInt64(I * Factor);
16461
16462 Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
16463 Value *LdN = nullptr;
16464 if (UseScalable)
16465 LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
16466 else
16467 LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
16468
16469 Value *Idx =
16470 Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
16471 Left = Builder.CreateInsertVector(
16472 VTy, Left, Builder.CreateExtractValue(LdN, 0), Idx);
16473 Right = Builder.CreateInsertVector(
16474 VTy, Right, Builder.CreateExtractValue(LdN, 1), Idx);
16475 }
16476
16477 Result = PoisonValue::get(DI->getType());
16478 Result = Builder.CreateInsertValue(Result, Left, 0);
16479 Result = Builder.CreateInsertValue(Result, Right, 1);
16480 } else {
16481 if (UseScalable)
16482 Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
16483 else
16484 Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
16485 }
16486
16487 DI->replaceAllUsesWith(Result);
16488 return true;
16489}
16490
16492 IntrinsicInst *II, StoreInst *SI) const {
16493 // Only interleave2 supported at present.
16494 if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
16495 return false;
16496
16497 // Only a factor of 2 supported at present.
16498 const unsigned Factor = 2;
16499
16500 VectorType *VTy = cast<VectorType>(II->getOperand(0)->getType());
16501 const DataLayout &DL = II->getModule()->getDataLayout();
16502 bool UseScalable;
16503 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
16504 return false;
16505
16506 // TODO: Add support for using SVE instructions with fixed types later, using
16507 // the code from lowerInterleavedStore to obtain the correct container type.
16508 if (UseScalable && !VTy->isScalableTy())
16509 return false;
16510
16511 unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
16512
16513 VectorType *StTy =
16515 VTy->getElementCount().divideCoefficientBy(NumStores));
16516
16517 Type *PtrTy = SI->getPointerOperandType();
16518 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
16519 UseScalable, StTy, PtrTy);
16520
16521 IRBuilder<> Builder(SI);
16522
16523 Value *BaseAddr = SI->getPointerOperand();
16524 Value *Pred = nullptr;
16525
16526 if (UseScalable)
16527 Pred =
16528 Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
16529
16530 Value *L = II->getOperand(0);
16531 Value *R = II->getOperand(1);
16532
16533 for (unsigned I = 0; I < NumStores; ++I) {
16534 Value *Address = BaseAddr;
16535 if (NumStores > 1) {
16536 Value *Offset = Builder.getInt64(I * Factor);
16537 Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
16538
16539 Value *Idx =
16540 Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
16541 L = Builder.CreateExtractVector(StTy, II->getOperand(0), Idx);
16542 R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
16543 }
16544
16545 if (UseScalable)
16546 Builder.CreateCall(StNFunc, {L, R, Pred, Address});
16547 else
16548 Builder.CreateCall(StNFunc, {L, R, Address});
16549 }
16550
16551 return true;
16552}
16553
16555 const MemOp &Op, const AttributeList &FuncAttributes) const {
16556 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
16557 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
16558 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
16559 // Only use AdvSIMD to implement memset of 32 bytes and above. Below that it
16560 // would take one instruction to materialize the v2i64 zero and one store (with
16561 // a restrictive addressing mode), so plain i64 stores are just as good.
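// For example (illustrative), a 64-byte memset with 16-byte alignment can be
// one "movi v0.2d, #0" plus four 16-byte "str q0" stores, whereas below 32
// bytes plain i64 stores are already optimal.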
16562 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
16563 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
16564 if (Op.isAligned(AlignCheck))
16565 return true;
16566 unsigned Fast;
16567 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
16569 Fast;
16570 };
16571
16572 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
16573 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
16574 return MVT::v16i8;
16575 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
16576 return MVT::f128;
16577 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
16578 return MVT::i64;
16579 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
16580 return MVT::i32;
16581 return MVT::Other;
16582}
16583
16585 const MemOp &Op, const AttributeList &FuncAttributes) const {
16586 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
16587 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
16588 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
16589 // Only use AdvSIMD to implement memset of 32 bytes and above. Below that it
16590 // would take one instruction to materialize the v2i64 zero and one store (with
16591 // a restrictive addressing mode), so plain i64 stores are just as good.
16592 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
16593 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
16594 if (Op.isAligned(AlignCheck))
16595 return true;
16596 unsigned Fast;
16597 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
16599 Fast;
16600 };
16601
16602 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
16603 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
16604 return LLT::fixed_vector(2, 64);
16605 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
16606 return LLT::scalar(128);
16607 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
16608 return LLT::scalar(64);
16609 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
16610 return LLT::scalar(32);
16611 return LLT();
16612}
16613
16614// 12-bit optionally shifted immediates are legal for adds.
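// For example (illustrative): 4095 (#0xfff) and 0xfff000 (#0xfff, lsl #12) are
// legal add immediates, while 4097 would need more than one instruction.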
16616 if (Immed == std::numeric_limits<int64_t>::min()) {
16617 LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
16618 << ": avoid UB for INT64_MIN\n");
16619 return false;
16620 }
16621 // Same encoding for add/sub, just flip the sign.
16622 Immed = std::abs(Immed);
16623 bool IsLegal = ((Immed >> 12) == 0 ||
16624 ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
16625 LLVM_DEBUG(dbgs() << "Is " << Immed
16626 << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
16627 return IsLegal;
16628}
16629
16631 // We will only emit addvl/inc* instructions for SVE2
16632 if (!Subtarget->hasSVE2())
16633 return false;
16634
16635 // addvl's immediates are in terms of the number of bytes in a register.
16636 // Since there are 16 bytes in the base supported size (128 bits), we need
16637 // to divide the immediate by 16 to get a useful immediate to multiply by
16638 // vscale, and the division must leave no remainder.
16639 if (Imm % 16 == 0)
16640 return isInt<6>(Imm / 16);
16641
16642 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
16643 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
16644 // of addvl as a result, so only take h|w|d into account.
16645 // Dec[h|w|d] will cover subtractions.
16646 // Immediates are in the range [1,16], so we can't do a 2's complement check.
16647 // FIXME: Can we make use of other patterns to cover other immediates?
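// Worked example (values chosen for illustration): Imm == 32 is a multiple
// of 16, so it maps to addvl #2; Imm == 24 is not, but 24 % 8 == 0 and
// |24 / 8| <= 16, so it can be covered by inch with a multiplier of 3.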
16648
16649 // inch|dech
16650 if (Imm % 8 == 0)
16651 return std::labs(Imm / 8) <= 16;
16652 // incw|decw
16653 if (Imm % 4 == 0)
16654 return std::labs(Imm / 4) <= 16;
16655 // incd|decd
16656 if (Imm % 2 == 0)
16657 return std::labs(Imm / 2) <= 16;
16658
16659 return false;
16660}
16661
16662// Return false to prevent folding
16663// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
16664// if the folding leads to worse code.
16666 SDValue AddNode, SDValue ConstNode) const {
16667 // Let the DAGCombiner decide for vector types and large types.
16668 const EVT VT = AddNode.getValueType();
16669 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
16670 return true;
16671
16672 // It is worse if c1 is legal add immediate, while c1*c2 is not
16673 // and has to be composed by at least two instructions.
16674 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
16675 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
16676 const int64_t C1 = C1Node->getSExtValue();
16677 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
16679 return true;
16681 // Adapt to the width of a register.
16682 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
16684 if (Insn.size() > 1)
16685 return false;
16686
16687 // Default to true and let the DAGCombiner decide.
16688 return true;
16689}
16690
16691// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
16692// immediates is the same as for an add or a sub.
16694 return isLegalAddImmediate(Immed);
16695}
16696
16697/// isLegalAddressingMode - Return true if the addressing mode represented
16698/// by AM is legal for this target, for a load/store of the specified type.
16700 const AddrMode &AMode, Type *Ty,
16701 unsigned AS, Instruction *I) const {
16702 // AArch64 has five basic addressing modes:
16703 // reg
16704 // reg + 9-bit signed offset
16705 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
16706 // reg1 + reg2
16707 // reg + SIZE_IN_BYTES * reg
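// For an i32 load, for example, [x0], [x0, #-256], [x0, #4092] and
// [x0, x1, lsl #2] are all representable here, while something like
// [x0, x1, #4] (base + register + immediate) is not.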
16708
16709 // No global is ever allowed as a base.
16710 if (AMode.BaseGV)
16711 return false;
16712
16713 // No reg+reg+imm addressing.
16714 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
16715 return false;
16716
16717 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
16718 // `2*ScaledReg` into `BaseReg + ScaledReg`
16719 AddrMode AM = AMode;
16720 if (AM.Scale && !AM.HasBaseReg) {
16721 if (AM.Scale == 1) {
16722 AM.HasBaseReg = true;
16723 AM.Scale = 0;
16724 } else if (AM.Scale == 2) {
16725 AM.HasBaseReg = true;
16726 AM.Scale = 1;
16727 } else {
16728 return false;
16729 }
16730 }
16731
16732 // A base register is required in all addressing modes.
16733 if (!AM.HasBaseReg)
16734 return false;
16735
16736 if (Ty->isScalableTy()) {
16737 if (isa<ScalableVectorType>(Ty)) {
16738 // See if we have a foldable vscale-based offset, for vector types which
16739 // are either legal or smaller than the minimum; more work will be
16740 // required if we need to consider addressing for types which need
16741 // legalization by splitting.
16742 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
16743 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
16744 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
16745 isPowerOf2_64(VecNumBytes))
16746 return isInt<4>(AM.ScalableOffset / (int64_t)VecNumBytes);
16747
16748 uint64_t VecElemNumBytes =
16749 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
16750 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
16751 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
16752 }
16753
16754 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
16755 }
16756
16757 // No scalable offsets allowed for non-scalable types.
16758 if (AM.ScalableOffset)
16759 return false;
16760
16761 // check reg + imm case:
16762 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
16763 uint64_t NumBytes = 0;
16764 if (Ty->isSized()) {
16765 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
16766 NumBytes = NumBits / 8;
16767 if (!isPowerOf2_64(NumBits))
16768 NumBytes = 0;
16769 }
16770
16771 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
16772 AM.Scale);
16773}
16774
16775 // Check whether the two offsets belong to the same imm24 range and share the
16776 // same high 12 bits; if so, the high part can be materialized with an add.
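// For example, with MinOffset = 0x1234 and MaxOffset = 0x1ff8 (illustrative
// values), both offsets share the high part 0x1000, which is itself a legal
// add immediate, so 0x1000 is returned and the accesses can be rebased on a
// single add.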
16777int64_t
16779 int64_t MaxOffset) const {
16780 int64_t HighPart = MinOffset & ~0xfffULL;
16781 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
16782 // Rebase the value to an integer multiple of imm12.
16783 return HighPart;
16784 }
16785
16786 return 0;
16787}
16788
16790 // Consider splitting large offset of struct or array.
16791 return true;
16792}
16793
16795 const MachineFunction &MF, EVT VT) const {
16796 VT = VT.getScalarType();
16797
16798 if (!VT.isSimple())
16799 return false;
16800
16801 switch (VT.getSimpleVT().SimpleTy) {
16802 case MVT::f16:
16803 return Subtarget->hasFullFP16();
16804 case MVT::f32:
16805 case MVT::f64:
16806 return true;
16807 default:
16808 break;
16809 }
16810
16811 return false;
16812}
16813
16815 Type *Ty) const {
16816 switch (Ty->getScalarType()->getTypeID()) {
16817 case Type::FloatTyID:
16818 case Type::DoubleTyID:
16819 return true;
16820 default:
16821 return false;
16822 }
16823}
16824
16826 EVT VT, CodeGenOptLevel OptLevel) const {
16827 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
16829}
16830
16831const MCPhysReg *
16833 // LR is a callee-save register, but we must treat it as clobbered by any call
16834 // site. Hence we include LR in the scratch registers, which are in turn added
16835 // as implicit-defs for stackmaps and patchpoints.
16836 static const MCPhysReg ScratchRegs[] = {
16837 AArch64::X16, AArch64::X17, AArch64::LR, 0
16838 };
16839 return ScratchRegs;
16840}
16841
16843 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
16844 return RCRegs;
16845}
16846
16847bool
16849 CombineLevel Level) const {
16850 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
16851 N->getOpcode() == ISD::SRL) &&
16852 "Expected shift op");
16853
16854 SDValue ShiftLHS = N->getOperand(0);
16855 EVT VT = N->getValueType(0);
16856
16857 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
16858 // combine it with shift 'N' to let it be lowered to UBFX except:
16859 // ((x >> C) & mask) << C.
16860 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
16861 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
16862 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
16863 if (isMask_64(TruncMask)) {
16864 SDValue AndLHS = ShiftLHS.getOperand(0);
16865 if (AndLHS.getOpcode() == ISD::SRL) {
16866 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
16867 if (N->getOpcode() == ISD::SHL)
16868 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
16869 return SRLC->getZExtValue() == SHLC->getZExtValue();
16870 return false;
16871 }
16872 }
16873 }
16874 }
16875 return true;
16876}
16877
16879 const SDNode *N) const {
16880 assert(N->getOpcode() == ISD::XOR &&
16881 (N->getOperand(0).getOpcode() == ISD::SHL ||
16882 N->getOperand(0).getOpcode() == ISD::SRL) &&
16883 "Expected XOR(SHIFT) pattern");
16884
16885 // Only commute if the entire NOT mask is a hidden shifted mask.
16886 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
16887 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
16888 if (XorC && ShiftC) {
16889 unsigned MaskIdx, MaskLen;
16890 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
16891 unsigned ShiftAmt = ShiftC->getZExtValue();
16892 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
16893 if (N->getOperand(0).getOpcode() == ISD::SHL)
16894 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
16895 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
16896 }
16897 }
16898
16899 return false;
16900}
16901
16903 const SDNode *N, CombineLevel Level) const {
16904 assert(((N->getOpcode() == ISD::SHL &&
16905 N->getOperand(0).getOpcode() == ISD::SRL) ||
16906 (N->getOpcode() == ISD::SRL &&
16907 N->getOperand(0).getOpcode() == ISD::SHL)) &&
16908 "Expected shift-shift mask");
16909 // Don't allow multiuse shift folding with the same shift amount.
16910 if (!N->getOperand(0)->hasOneUse())
16911 return false;
16912
16913 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
16914 EVT VT = N->getValueType(0);
16915 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
16916 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
16917 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
16918 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
16919 }
16920
16921 return true;
16922}
16923
16925 unsigned BinOpcode, EVT VT) const {
16926 return VT.isScalableVector() && isTypeLegal(VT);
16927}
16928
16930 Type *Ty) const {
16931 assert(Ty->isIntegerTy());
16932
16933 unsigned BitSize = Ty->getPrimitiveSizeInBits();
16934 if (BitSize == 0)
16935 return false;
16936
16937 int64_t Val = Imm.getSExtValue();
16938 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
16939 return true;
16940
16941 if ((int64_t)Val < 0)
16942 Val = ~Val;
16943 if (BitSize == 32)
16944 Val &= (1LL << 32) - 1;
16945
16946 unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
16947 // MOVZ is free so return true for one or fewer MOVK.
16948 return Shift < 3;
16949}
16950
16952 unsigned Index) const {
16954 return false;
16955
16956 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
16957}
16958
16959/// Turn vector tests of the signbit in the form of:
16960/// xor (sra X, elt_size(X)-1), -1
16961/// into:
16962/// cmge X, X, #0
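/// For a v4i32 input, for instance, the shift amount must be 31
/// (elt_size(X) - 1) for the fold to apply.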
16964 const AArch64Subtarget *Subtarget) {
16965 EVT VT = N->getValueType(0);
16966 if (!Subtarget->hasNEON() || !VT.isVector())
16967 return SDValue();
16968
16969 // There must be a shift right algebraic before the xor, and the xor must be a
16970 // 'not' operation.
16971 SDValue Shift = N->getOperand(0);
16972 SDValue Ones = N->getOperand(1);
16973 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
16975 return SDValue();
16976
16977 // The shift should be smearing the sign bit across each vector element.
16978 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
16979 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
16980 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
16981 return SDValue();
16982
16983 return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
16984}
16985
16986// Given a vecreduce_add node, detect the below pattern and convert it to the
16987 // node sequence with UABDL, [S|U]ABD and UADDLP.
16988//
16989// i32 vecreduce_add(
16990// v16i32 abs(
16991// v16i32 sub(
16992// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
16993// =================>
16994// i32 vecreduce_add(
16995// v4i32 UADDLP(
16996// v8i16 add(
16997// v8i16 zext(
16998// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
16999// v8i16 zext(
17000// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
17002 SelectionDAG &DAG) {
17003 // Assumed i32 vecreduce_add
17004 if (N->getValueType(0) != MVT::i32)
17005 return SDValue();
17006
17007 SDValue VecReduceOp0 = N->getOperand(0);
17008 unsigned Opcode = VecReduceOp0.getOpcode();
17009 // Assumed v16i32 abs
17010 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
17011 return SDValue();
17012
17013 SDValue ABS = VecReduceOp0;
17014 // Assumed v16i32 sub
17015 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
17016 ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
17017 return SDValue();
17018
17019 SDValue SUB = ABS->getOperand(0);
17020 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
17021 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
17022 // Assumed v16i32 type
17023 if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
17024 SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
17025 return SDValue();
17026
17027 // Assumed zext or sext
17028 bool IsZExt = false;
17029 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
17030 IsZExt = true;
17031 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
17032 IsZExt = false;
17033 } else
17034 return SDValue();
17035
17036 SDValue EXT0 = SUB->getOperand(0);
17037 SDValue EXT1 = SUB->getOperand(1);
17038 // Assumed zext's operand has v16i8 type
17039 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
17040 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
17041 return SDValue();
17042
17043 // Pattern is detected. Let's convert it to a sequence of nodes.
17044 SDLoc DL(N);
17045
17046 // First, create the node pattern of UABD/SABD.
17047 SDValue UABDHigh8Op0 =
17048 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
17049 DAG.getConstant(8, DL, MVT::i64));
17050 SDValue UABDHigh8Op1 =
17051 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
17052 DAG.getConstant(8, DL, MVT::i64));
17053 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
17054 UABDHigh8Op0, UABDHigh8Op1);
17055 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
17056
17057 // Second, create the node pattern of UABAL.
17058 SDValue UABDLo8Op0 =
17059 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
17060 DAG.getConstant(0, DL, MVT::i64));
17061 SDValue UABDLo8Op1 =
17062 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
17063 DAG.getConstant(0, DL, MVT::i64));
17064 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
17065 UABDLo8Op0, UABDLo8Op1);
17066 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
17067 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
17068
17069 // Third, create the node of UADDLP.
17070 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
17071
17072 // Fourth, create the node of VECREDUCE_ADD.
17073 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
17074}
17075
17076// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
17077// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
17078// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
17079 // If we have vectors larger than v16i8, we extract v16i8 vectors, follow
17080 // the same steps above to get DOT instructions, concatenate them,
17081 // and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
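// For example (illustrative): for a v32i8 input A that is zero-extended and
// reduced, two v16i8 slices of A are fed to UDOT against a splat of 1, the
// two v4i32 partial sums are concatenated, and a single vecreduce.add of the
// v8i32 concat produces the final i32 result.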
17083 const AArch64Subtarget *ST) {
17084 if (!ST->hasDotProd())
17086
17087 SDValue Op0 = N->getOperand(0);
17088 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
17089 Op0.getValueType().getVectorElementType() != MVT::i32)
17090 return SDValue();
17091
17092 unsigned ExtOpcode = Op0.getOpcode();
17093 SDValue A = Op0;
17094 SDValue B;
17095 if (ExtOpcode == ISD::MUL) {
17096 A = Op0.getOperand(0);
17097 B = Op0.getOperand(1);
17098 if (A.getOpcode() != B.getOpcode() ||
17099 A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
17100 return SDValue();
17101 ExtOpcode = A.getOpcode();
17102 }
17103 if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
17104 return SDValue();
17105
17106 EVT Op0VT = A.getOperand(0).getValueType();
17107 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
17108 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
17109 if (!IsValidElementCount || !IsValidSize)
17110 return SDValue();
17111
17112 SDLoc DL(Op0);
17113 // For non-mla reductions B can be set to 1. For MLA we take the operand of
17114 // the extend B.
17115 if (!B)
17116 B = DAG.getConstant(1, DL, Op0VT);
17117 else
17118 B = B.getOperand(0);
17119
17120 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
17121 unsigned NumOfVecReduce;
17122 EVT TargetType;
17123 if (IsMultipleOf16) {
17124 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
17125 TargetType = MVT::v4i32;
17126 } else {
17127 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
17128 TargetType = MVT::v2i32;
17129 }
17130 auto DotOpcode =
17132 // Handle the case where we need to generate only one Dot operation.
17133 if (NumOfVecReduce == 1) {
17134 SDValue Zeros = DAG.getConstant(0, DL, TargetType);
17135 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
17136 A.getOperand(0), B);
17137 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
17138 }
17139 // Generate Dot instructions that are multiple of 16.
17140 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
17141 SmallVector<SDValue, 4> SDotVec16;
17142 unsigned I = 0;
17143 for (; I < VecReduce16Num; I += 1) {
17144 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
17145 SDValue Op0 =
17146 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
17147 DAG.getConstant(I * 16, DL, MVT::i64));
17148 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
17149 DAG.getConstant(I * 16, DL, MVT::i64));
17150 SDValue Dot =
17151 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
17152 SDotVec16.push_back(Dot);
17153 }
17154 // Concatenate dot operations.
17155 EVT SDot16EVT =
17156 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
17157 SDValue ConcatSDot16 =
17158 DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
17159 SDValue VecReduceAdd16 =
17160 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
17161 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
17162 if (VecReduce8Num == 0)
17163 return VecReduceAdd16;
17164
17165 // Generate the remainder Dot operation that is multiple of 8.
17166 SmallVector<SDValue, 4> SDotVec8;
17167 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
17168 SDValue Vec8Op0 =
17169 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
17170 DAG.getConstant(I * 16, DL, MVT::i64));
17171 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
17172 DAG.getConstant(I * 16, DL, MVT::i64));
17173 SDValue Dot =
17174 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
17175 SDValue VecReduceAdd8 =
17176 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
17177 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
17178 VecReduceAdd8);
17179}
17180
17181// Given an (integer) vecreduce, we know the order of the inputs does not
17182// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
17183// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
17184// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
17186 auto DetectAddExtract = [&](SDValue A) {
17187 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
17188 // UADDLP(x) if found.
17189 assert(A.getOpcode() == ISD::ADD);
17190 EVT VT = A.getValueType();
17191 SDValue Op0 = A.getOperand(0);
17192 SDValue Op1 = A.getOperand(1);
17193 if (Op0.getOpcode() != Op1.getOpcode() ||
17194 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
17195 Op0.getOpcode() != ISD::SIGN_EXTEND))
17196 return SDValue();
17197 SDValue Ext0 = Op0.getOperand(0);
17198 SDValue Ext1 = Op1.getOperand(0);
17199 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
17201 Ext0.getOperand(0) != Ext1.getOperand(0))
17202 return SDValue();
17203 // Check that the type is twice the add type, and that the extracts are from
17204 // the upper/lower parts of the same source.
17206 VT.getVectorNumElements() * 2)
17207 return SDValue();
17208 if ((Ext0.getConstantOperandVal(1) != 0 ||
17210 (Ext1.getConstantOperandVal(1) != 0 ||
17212 return SDValue();
17213 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
17215 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
17216 };
17217
17218 if (SDValue R = DetectAddExtract(A))
17219 return R;
17220
17221 if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
17222 if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
17223 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
17224 A.getOperand(1));
17225 if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
17226 if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
17227 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
17228 A.getOperand(0));
17229 return SDValue();
17230}
17231
17232// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
17233// UADDLV(concat), where the concat represents the 64-bit zext sources.
17235 // Look for add(zext(64-bit source), zext(64-bit source)), returning
17236 // UADDLV(concat(zext, zext)) if found.
17237 assert(A.getOpcode() == ISD::ADD);
17238 EVT VT = A.getValueType();
17239 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
17240 return SDValue();
17241 SDValue Op0 = A.getOperand(0);
17242 SDValue Op1 = A.getOperand(1);
17243 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
17244 return SDValue();
17245 SDValue Ext0 = Op0.getOperand(0);
17246 SDValue Ext1 = Op1.getOperand(0);
17247 EVT ExtVT0 = Ext0.getValueType();
17248 EVT ExtVT1 = Ext1.getValueType();
17249 // Check zext VTs are the same and 64-bit length.
17250 if (ExtVT0 != ExtVT1 ||
17251 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
17252 return SDValue();
17253 // Get VT for concat of zext sources.
17254 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
17255 SDValue Concat =
17256 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
17257
17258 switch (VT.getSimpleVT().SimpleTy) {
17259 case MVT::v2i64:
17260 case MVT::v4i32:
17261 return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
17262 case MVT::v8i16: {
17263 SDValue Uaddlv =
17264 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
17265 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
17266 }
17267 default:
17268 llvm_unreachable("Unhandled vector type");
17269 }
17270}
17271
17273 SDValue A = N->getOperand(0);
17274 if (A.getOpcode() == ISD::ADD) {
17275 if (SDValue R = performUADDVAddCombine(A, DAG))
17276 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
17277 else if (SDValue R = performUADDVZextCombine(A, DAG))
17278 return R;
17279 }
17280 return SDValue();
17281}
17282
17285 const AArch64Subtarget *Subtarget) {
17286 if (DCI.isBeforeLegalizeOps())
17287 return SDValue();
17288
17289 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
17290}
17291
17292SDValue
17293AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
17294 SelectionDAG &DAG,
17295 SmallVectorImpl<SDNode *> &Created) const {
17297 if (isIntDivCheap(N->getValueType(0), Attr))
17298 return SDValue(N,0); // Lower SDIV as SDIV
17299
17300 EVT VT = N->getValueType(0);
17301
17302 // For scalable and fixed types, mark them as cheap so we can handle it much
17303 // later. This allows us to handle larger than legal types.
17304 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
17305 return SDValue(N, 0);
17306
17307 // fold (sdiv X, pow2)
17308 if ((VT != MVT::i32 && VT != MVT::i64) ||
17309 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
17310 return SDValue();
17311
17312 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
17313}
17314
17315SDValue
17316AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
17317 SelectionDAG &DAG,
17318 SmallVectorImpl<SDNode *> &Created) const {
17320 if (isIntDivCheap(N->getValueType(0), Attr))
17321 return SDValue(N, 0); // Lower SREM as SREM
17322
17323 EVT VT = N->getValueType(0);
17324
17325 // For scalable and fixed types, mark them as cheap so we can handle it much
17326 // later. This allows us to handle larger than legal types.
17327 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
17328 return SDValue(N, 0);
17329
17330 // fold (srem X, pow2)
17331 if ((VT != MVT::i32 && VT != MVT::i64) ||
17332 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
17333 return SDValue();
17334
17335 unsigned Lg2 = Divisor.countr_zero();
17336 if (Lg2 == 0)
17337 return SDValue();
17338
17339 SDLoc DL(N);
17340 SDValue N0 = N->getOperand(0);
17341 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
17342 SDValue Zero = DAG.getConstant(0, DL, VT);
17343 SDValue CCVal, CSNeg;
17344 if (Lg2 == 1) {
17345 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
17346 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
17347 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
17348
17349 Created.push_back(Cmp.getNode());
17350 Created.push_back(And.getNode());
17351 } else {
17352 SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
17353 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
17354
17355 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
17356 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
17357 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
17358 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
17359 Negs.getValue(1));
17360
17361 Created.push_back(Negs.getNode());
17362 Created.push_back(AndPos.getNode());
17363 Created.push_back(AndNeg.getNode());
17364 }
17365
17366 return CSNeg;
17367}
17368
17369static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
17370 switch(getIntrinsicID(S.getNode())) {
17371 default:
17372 break;
17373 case Intrinsic::aarch64_sve_cntb:
17374 return 8;
17375 case Intrinsic::aarch64_sve_cnth:
17376 return 16;
17377 case Intrinsic::aarch64_sve_cntw:
17378 return 32;
17379 case Intrinsic::aarch64_sve_cntd:
17380 return 64;
17381 }
17382 return {};
17383}
17384
17385/// Calculates what the pre-extend type is, based on the extension
17386/// operation node provided by \p Extend.
17387///
17388/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
17389/// pre-extend type is pulled directly from the operand, while other extend
17390/// operations need a bit more inspection to get this information.
17391///
17392/// \param Extend The SDNode from the DAG that represents the extend operation
17393///
17394/// \returns The type representing the \p Extend source type, or \p MVT::Other
17395/// if no valid type can be determined
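/// For example, (and x, 0xff) yields MVT::i8 and (assertsext x, ValueType:i16)
/// yields MVT::i16, while an unrecognised mask such as 0x7f yields MVT::Other.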
17397 switch (Extend.getOpcode()) {
17398 case ISD::SIGN_EXTEND:
17399 case ISD::ZERO_EXTEND:
17400 return Extend.getOperand(0).getValueType();
17401 case ISD::AssertSext:
17402 case ISD::AssertZext:
17404 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
17405 if (!TypeNode)
17406 return MVT::Other;
17407 return TypeNode->getVT();
17408 }
17409 case ISD::AND: {
17411 dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
17412 if (!Constant)
17413 return MVT::Other;
17414
17415 uint32_t Mask = Constant->getZExtValue();
17416
17417 if (Mask == UCHAR_MAX)
17418 return MVT::i8;
17419 else if (Mask == USHRT_MAX)
17420 return MVT::i16;
17421 else if (Mask == UINT_MAX)
17422 return MVT::i32;
17423
17424 return MVT::Other;
17425 }
17426 default:
17427 return MVT::Other;
17428 }
17429}
17430
17431/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
17432/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
17433/// SExt/ZExt rather than the scalar SExt/ZExt
17435 EVT VT = BV.getValueType();
17436 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
17438 return SDValue();
17439
17440 // Use the first item in the buildvector/shuffle to get the size of the
17441 // extend, and make sure it looks valid.
17442 SDValue Extend = BV->getOperand(0);
17443 unsigned ExtendOpcode = Extend.getOpcode();
17444 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
17445 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
17446 ExtendOpcode == ISD::AssertSext;
17447 if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
17448 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
17449 return SDValue();
17450 // Shuffle inputs are vectors; limit to SIGN_EXTEND and ZERO_EXTEND to ensure
17451 // calculatePreExtendType will work without issue.
17452 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
17453 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
17454 return SDValue();
17455
17456 // Restrict valid pre-extend data type
17457 EVT PreExtendType = calculatePreExtendType(Extend);
17458 if (PreExtendType == MVT::Other ||
17459 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
17460 return SDValue();
17461
17462 // Make sure all other operands are equally extended
17463 for (SDValue Op : drop_begin(BV->ops())) {
17464 if (Op.isUndef())
17465 continue;
17466 unsigned Opc = Op.getOpcode();
17467 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
17468 Opc == ISD::AssertSext;
17469 if (OpcIsSExt != IsSExt || calculatePreExtendType(Op) != PreExtendType)
17470 return SDValue();
17471 }
17472
17473 SDValue NBV;
17474 SDLoc DL(BV);
17475 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
17476 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
17477 EVT PreExtendLegalType =
17478 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
17480 for (SDValue Op : BV->ops())
17481 NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
17482 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
17483 PreExtendLegalType));
17484 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
17485 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
17486 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
17487 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
17488 BV.getOperand(1).isUndef()
17489 ? DAG.getUNDEF(PreExtendVT)
17490 : BV.getOperand(1).getOperand(0),
17491 cast<ShuffleVectorSDNode>(BV)->getMask());
17492 }
17493 return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, NBV);
17494}
17495
17496/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
17497/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
17499 // If the value type isn't a vector, none of the operands are going to be dups
17500 EVT VT = Mul->getValueType(0);
17501 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
17502 return SDValue();
17503
17504 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
17505 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
17506
17507 // Neither operands have been changed, don't make any further changes
17508 if (!Op0 && !Op1)
17509 return SDValue();
17510
17511 SDLoc DL(Mul);
17512 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
17513 Op1 ? Op1 : Mul->getOperand(1));
17514}
17515
17516// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
17517// Same for other types with equivalent constants.
17519 EVT VT = N->getValueType(0);
17520 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
17521 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
17522 return SDValue();
17523 if (N->getOperand(0).getOpcode() != ISD::AND ||
17524 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
17525 return SDValue();
17526
17527 SDValue And = N->getOperand(0);
17528 SDValue Srl = And.getOperand(0);
17529
17530 APInt V1, V2, V3;
17531 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
17532 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
17534 return SDValue();
17535
17536 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
17537 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
17538 V3 != (HalfSize - 1))
17539 return SDValue();
17540
17541 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
17542 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
17543 VT.getVectorElementCount() * 2);
17544
17545 SDLoc DL(N);
17546 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
17547 SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In);
17548 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
17549}
17550
17553 const AArch64Subtarget *Subtarget) {
17554
17555 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
17556 return Ext;
17558 return Ext;
17559
17560 if (DCI.isBeforeLegalizeOps())
17561 return SDValue();
17562
17563 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
17564 // so that in the MachineCombiner pass, add+mul can be combined into madd.
17565 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
17566 SDLoc DL(N);
17567 EVT VT = N->getValueType(0);
17568 SDValue N0 = N->getOperand(0);
17569 SDValue N1 = N->getOperand(1);
17570 SDValue MulOper;
17571 unsigned AddSubOpc;
17572
17573 auto IsAddSubWith1 = [&](SDValue V) -> bool {
17574 AddSubOpc = V->getOpcode();
17575 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
17576 SDValue Opnd = V->getOperand(1);
17577 MulOper = V->getOperand(0);
17578 if (AddSubOpc == ISD::SUB)
17579 std::swap(Opnd, MulOper);
17580 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
17581 return C->isOne();
17582 }
17583 return false;
17584 };
17585
17586 if (IsAddSubWith1(N0)) {
17587 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
17588 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
17589 }
17590
17591 if (IsAddSubWith1(N1)) {
17592 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
17593 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
17594 }
17595
17596 // The below optimizations require a constant RHS.
17597 if (!isa<ConstantSDNode>(N1))
17598 return SDValue();
17599
17600 ConstantSDNode *C = cast<ConstantSDNode>(N1);
17601 const APInt &ConstValue = C->getAPIntValue();
17602
17603 // Allow the scaling to be folded into the `cnt` instruction by preventing
17604 // the scaling from being obscured here. This makes it easier to pattern match.
17605 if (IsSVECntIntrinsic(N0) ||
17606 (N0->getOpcode() == ISD::TRUNCATE &&
17607 (IsSVECntIntrinsic(N0->getOperand(0)))))
17608 if (ConstValue.sge(1) && ConstValue.sle(16))
17609 return SDValue();
17610
17611 // Multiplication of a power of two plus/minus one can be done more
17612 // cheaply as shift+add/sub. For now, this is true unilaterally. If
17613 // future CPUs have a cheaper MADD instruction, this may need to be
17614 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
17615 // 64-bit is 5 cycles, so this is always a win.
17616 // More aggressively, some multiplications N0 * C can be lowered to
17617 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
17618 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
17619 // TODO: lower more cases.
17620
17621 // TrailingZeroes is used to test if the mul can be lowered to
17622 // shift+add+shift.
17623 unsigned TrailingZeroes = ConstValue.countr_zero();
17624 if (TrailingZeroes) {
17625 // Conservatively do not lower to shift+add+shift if the mul might be
17626 // folded into smul or umul.
17627 if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
17628 isZeroExtended(N0, DAG)))
17629 return SDValue();
17630 // Conservatively do not lower to shift+add+shift if the mul might be
17631 // folded into madd or msub.
17632 if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
17633 N->use_begin()->getOpcode() == ISD::SUB))
17634 return SDValue();
17635 }
17636 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
17637 // and shift+add+shift.
17638 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
17639 unsigned ShiftAmt;
17640
17641 auto Shl = [&](SDValue N0, unsigned N1) {
17642 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
17643 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
17644 };
17645 auto Add = [&](SDValue N0, SDValue N1) {
17646 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
17647 };
17648 auto Sub = [&](SDValue N0, SDValue N1) {
17649 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
17650 };
17651 auto Negate = [&](SDValue N) {
17652 SDValue Zero = DAG.getConstant(0, DL, VT);
17653 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
17654 };
17655
17656 // Can the const C be decomposed into (1+2^M1)*(1+2^N1), eg:
17657 // C = 45 is equal to (1+4)*(1+8); we don't decompose it into (1+2)*(16-1) as
17658 // the (2^N - 1) can't be executed via a single instruction.
17659 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
17660 unsigned BitWidth = C.getBitWidth();
17661 for (unsigned i = 1; i < BitWidth / 2; i++) {
17662 APInt Rem;
17663 APInt X(BitWidth, (1 << i) + 1);
17664 APInt::sdivrem(C, X, N, Rem);
17665 APInt NVMinus1 = N - 1;
17666 if (Rem == 0 && NVMinus1.isPowerOf2()) {
17667 M = X;
17668 return true;
17669 }
17670 }
17671 return false;
17672 };
17673
17674 if (ConstValue.isNonNegative()) {
17675 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
17676 // (mul x, 2^N - 1) => (sub (shl x, N), x)
17677 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
17678 // (mul x, (2^M + 1) * (2^N + 1))
17679 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
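// Worked example: C = 6 = (2^1 + 1) * 2^1, so ShiftedConstValue = 3,
// TrailingZeroes = 1, and the result is (shl (add (shl x, 1), x), 1),
// i.e. (2x + x) * 2 = 6x.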
17680 APInt SCVMinus1 = ShiftedConstValue - 1;
17681 APInt SCVPlus1 = ShiftedConstValue + 1;
17682 APInt CVPlus1 = ConstValue + 1;
17683 APInt CVM, CVN;
17684 if (SCVMinus1.isPowerOf2()) {
17685 ShiftAmt = SCVMinus1.logBase2();
17686 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
17687 } else if (CVPlus1.isPowerOf2()) {
17688 ShiftAmt = CVPlus1.logBase2();
17689 return Sub(Shl(N0, ShiftAmt), N0);
17690 } else if (SCVPlus1.isPowerOf2()) {
17691 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
17692 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
17693 } else if (Subtarget->hasALULSLFast() &&
17694 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
17695 APInt CVMMinus1 = CVM - 1;
17696 APInt CVNMinus1 = CVN - 1;
17697 unsigned ShiftM1 = CVMMinus1.logBase2();
17698 unsigned ShiftN1 = CVNMinus1.logBase2();
17699 // LSLFast implies that shifts of up to 3 places are fast.
17700 if (ShiftM1 <= 3 && ShiftN1 <= 3) {
17701 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
17702 return Add(Shl(MVal, ShiftN1), MVal);
17703 }
17704 }
17705 } else {
17706 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
17707 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
17708 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
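// Worked example: C = -7, so -C + 1 = 8 is a power of two and the result is
// (sub x, (shl x, 3)), i.e. x - 8x = -7x.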
17709 APInt SCVPlus1 = -ShiftedConstValue + 1;
17710 APInt CVNegPlus1 = -ConstValue + 1;
17711 APInt CVNegMinus1 = -ConstValue - 1;
17712 if (CVNegPlus1.isPowerOf2()) {
17713 ShiftAmt = CVNegPlus1.logBase2();
17714 return Sub(N0, Shl(N0, ShiftAmt));
17715 } else if (CVNegMinus1.isPowerOf2()) {
17716 ShiftAmt = CVNegMinus1.logBase2();
17717 return Negate(Add(Shl(N0, ShiftAmt), N0));
17718 } else if (SCVPlus1.isPowerOf2()) {
17719 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
17720 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
17721 }
17722 }
17723
17724 return SDValue();
17725}
17726
17728 SelectionDAG &DAG) {
17729 // Take advantage of vector comparisons producing 0 or -1 in each lane to
17730 // optimize away the operation when it's applied to a constant.
17731 //
17732 // The general transformation is:
17733 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
17734 // AND(VECTOR_CMP(x,y), constant2)
17735 // constant2 = UNARYOP(constant)
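// For example (illustrative types): sint_to_fp(and(setcc(x, y), {1,1,1,1}))
// on v4i32 becomes bitcast(and(setcc(x, y), bitcast({1.0,1.0,1.0,1.0}))),
// because each lane of the compare is either all-ones or zero.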
17736
17737 // Early exit if this isn't a vector operation, the operand of the
17738 // unary operation isn't a bitwise AND, or if the sizes of the operations
17739 // aren't the same.
17740 EVT VT = N->getValueType(0);
17741 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
17742 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
17743 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
17744 return SDValue();
17745
17746 // Now check that the other operand of the AND is a constant. We could
17747 // make the transformation for non-constant splats as well, but it's unclear
17748 // that would be a benefit as it would not eliminate any operations, just
17749 // perform one more step in scalar code before moving to the vector unit.
17750 if (BuildVectorSDNode *BV =
17751 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
17752 // Bail out if the vector isn't a constant.
17753 if (!BV->isConstant())
17754 return SDValue();
17755
17756 // Everything checks out. Build up the new and improved node.
17757 SDLoc DL(N);
17758 EVT IntVT = BV->getValueType(0);
17759 // Create a new constant of the appropriate type for the transformed
17760 // DAG.
17761 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
17762 // The AND node needs bitcasts to/from an integer vector type around it.
17763 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
17764 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
17765 N->getOperand(0)->getOperand(0), MaskConst);
17766 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
17767 return Res;
17768 }
17769
17770 return SDValue();
17771}
17772
17774 const AArch64Subtarget *Subtarget) {
17775 // First try to optimize away the conversion when it's conditionally from
17776 // a constant. Vectors only.
17778 return Res;
17779
17780 EVT VT = N->getValueType(0);
17781 if (VT != MVT::f32 && VT != MVT::f64)
17782 return SDValue();
17783
17784 // Only optimize when the source and destination types have the same width.
17785 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
17786 return SDValue();
17787
17788 // If the result of an integer load is only used by an integer-to-float
17789 // conversion, use an FP load and an AdvSIMD scalar {S|U}CVTF instead.
17790 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
17791 SDValue N0 = N->getOperand(0);
17792 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
17793 N0.hasOneUse() &&
17794 // Do not change the width of a volatile load.
17795 !cast<LoadSDNode>(N0)->isVolatile()) {
17796 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
17797 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
17798 LN0->getPointerInfo(), LN0->getAlign(),
17799 LN0->getMemOperand()->getFlags());
17800
17801 // Make sure successors of the original load stay after it by updating them
17802 // to use the new Chain.
17803 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
17804
17805 unsigned Opcode =
17807 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
17808 }
17809
17810 return SDValue();
17811}
17812
17813/// Fold a floating-point multiply by power of two into floating-point to
17814/// fixed-point conversion.
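/// For example (illustrative), fptosi(fmul x, splat(8.0)) on v4f32 can be
/// lowered to the aarch64_neon_vcvtfp2fxs intrinsic with 3 fractional bits,
/// i.e. a single fcvtzs with #3.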
17817 const AArch64Subtarget *Subtarget) {
17818 if (!Subtarget->isNeonAvailable())
17819 return SDValue();
17820
17821 if (!N->getValueType(0).isSimple())
17822 return SDValue();
17823
17824 SDValue Op = N->getOperand(0);
17825 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
17826 return SDValue();
17827
17828 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
17829 return SDValue();
17830
17831 SDValue ConstVec = Op->getOperand(1);
17832 if (!isa<BuildVectorSDNode>(ConstVec))
17833 return SDValue();
17834
17835 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
17836 uint32_t FloatBits = FloatTy.getSizeInBits();
17837 if (FloatBits != 32 && FloatBits != 64 &&
17838 (FloatBits != 16 || !Subtarget->hasFullFP16()))
17839 return SDValue();
17840
17841 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
17842 uint32_t IntBits = IntTy.getSizeInBits();
17843 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
17844 return SDValue();
17845
17846 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
17847 if (IntBits > FloatBits)
17848 return SDValue();
17849
17850 BitVector UndefElements;
17851 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
17852 int32_t Bits = IntBits == 64 ? 64 : 32;
17853 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
17854 if (C == -1 || C == 0 || C > Bits)
17855 return SDValue();
17856
17857 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
17858 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
17859 return SDValue();
17860
17861 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
17862 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
17863 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
17864 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
17865 return SDValue();
17866 }
17867
17868 SDLoc DL(N);
17869 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
17870 N->getOpcode() == ISD::FP_TO_SINT_SAT);
17871 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
17872 : Intrinsic::aarch64_neon_vcvtfp2fxu;
17873 SDValue FixConv =
17875 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
17876 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
17877 // We can handle smaller integers by generating an extra trunc.
17878 if (IntBits < FloatBits)
17879 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
17880
17881 return FixConv;
17882}
17883
17884/// Fold a floating-point divide by power of two into fixed-point to
17885/// floating-point conversion.
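/// For example (illustrative), (sint_to_fp x) / splat(16.0) on v4i32 can be
/// lowered to the aarch64_neon_vcvtfxs2fp intrinsic with 4 fractional bits,
/// i.e. a single scvtf with #4.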
17888 const AArch64Subtarget *Subtarget) {
17889 if (!Subtarget->hasNEON())
17890 return SDValue();
17891
17892 SDValue Op = N->getOperand(0);
17893 unsigned Opc = Op->getOpcode();
17894 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
17895 !Op.getOperand(0).getValueType().isSimple() ||
17896 (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
17897 return SDValue();
17898
17899 SDValue ConstVec = N->getOperand(1);
17900 if (!isa<BuildVectorSDNode>(ConstVec))
17901 return SDValue();
17902
17903 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
17904 int32_t IntBits = IntTy.getSizeInBits();
17905 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
17906 return SDValue();
17907
17908 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
17909 int32_t FloatBits = FloatTy.getSizeInBits();
17910 if (FloatBits != 32 && FloatBits != 64)
17911 return SDValue();
17912
17913 // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
17914 if (IntBits > FloatBits)
17915 return SDValue();
17916
17917 BitVector UndefElements;
17918 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
17919 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
17920 if (C == -1 || C == 0 || C > FloatBits)
17921 return SDValue();
17922
17923 MVT ResTy;
17924 unsigned NumLanes = Op.getValueType().getVectorNumElements();
17925 switch (NumLanes) {
17926 default:
17927 return SDValue();
17928 case 2:
17929 ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
17930 break;
17931 case 4:
17932 ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
17933 break;
17934 }
17935
17936 if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
17937 return SDValue();
17938
17939 SDLoc DL(N);
17940 SDValue ConvInput = Op.getOperand(0);
17941 bool IsSigned = Opc == ISD::SINT_TO_FP;
17942 if (IntBits < FloatBits)
17943 ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
17944 ResTy, ConvInput);
17945
17946 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
17947 : Intrinsic::aarch64_neon_vcvtfxu2fp;
17948 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
17949 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
17950 DAG.getConstant(C, DL, MVT::i32));
17951}
17952
17954 const AArch64TargetLowering &TLI) {
17955 EVT VT = N->getValueType(0);
17956 SelectionDAG &DAG = DCI.DAG;
17957 SDLoc DL(N);
17958
17959 if (!VT.isVector())
17960 return SDValue();
17961
17962 // The combining code currently only works for NEON vectors. In particular,
17963 // it does not work for SVE when dealing with vectors wider than 128 bits.
17964 // It also doesn't work in streaming mode because it would generate
17965 // BSL instructions that are invalid in streaming mode.
17968 return SDValue();
17969
17970 SDValue N0 = N->getOperand(0);
17971 if (N0.getOpcode() != ISD::AND)
17972 return SDValue();
17973
17974 SDValue N1 = N->getOperand(1);
17975 if (N1.getOpcode() != ISD::AND)
17976 return SDValue();
17977
17978 // InstCombine does (not (neg a)) => (add a -1).
17979 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
17980 // Loop over all combinations of AND operands.
17981 for (int i = 1; i >= 0; --i) {
17982 for (int j = 1; j >= 0; --j) {
17983 SDValue O0 = N0->getOperand(i);
17984 SDValue O1 = N1->getOperand(j);
17985 SDValue Sub, Add, SubSibling, AddSibling;
17986
17987 // Find a SUB and an ADD operand, one from each AND.
17988 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
17989 Sub = O0;
17990 Add = O1;
17991 SubSibling = N0->getOperand(1 - i);
17992 AddSibling = N1->getOperand(1 - j);
17993 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
17994 Add = O0;
17995 Sub = O1;
17996 AddSibling = N0->getOperand(1 - i);
17997 SubSibling = N1->getOperand(1 - j);
17998 } else
17999 continue;
18000
18002 continue;
18003
18004 // The all-ones constant is always the right-hand operand of the Add.
18005 if (!ISD::isBuildVectorAllOnes(Add.getOperand(1).getNode()))
18006 continue;
18007
18008 if (Sub.getOperand(1) != Add.getOperand(0))
18009 continue;
18010
18011 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
18012 }
18013 }
18014
18015 // (or (and a b) (and (not a) c)) => (bsl a b c)
18016 // We only have to look for constant vectors here since the general, variable
18017 // case can be handled in TableGen.
18018 unsigned Bits = VT.getScalarSizeInBits();
18019 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
18020 for (int i = 1; i >= 0; --i)
18021 for (int j = 1; j >= 0; --j) {
18022 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
18023 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
18024 if (!BVN0 || !BVN1)
18025 continue;
18026
18027 bool FoundMatch = true;
18028 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
18029 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
18030 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
18031 if (!CN0 || !CN1 ||
18032 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
18033 FoundMatch = false;
18034 break;
18035 }
18036 }
18037
18038 if (FoundMatch)
18039 return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
18040 N0->getOperand(1 - i), N1->getOperand(1 - j));
18041 }
18042
18043 return SDValue();
18044}
18045
18046// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
18047// convert to csel(ccmp(.., cc0)), depending on cc1:
18048
18049// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
18050// =>
18051// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
18052//
18053// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
18054// =>
18055// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
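// As a rough example, a source-level (a == 0 && b > 5) becomes
// cmp a, #0; ccmp b, #5, #<nzcv for !gt>, eq; cset w0, gt
// where the ccmp only performs the second compare if the first condition
// held, and otherwise sets flags that make the final gt check fail.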
18057 EVT VT = N->getValueType(0);
18058 SDValue CSel0 = N->getOperand(0);
18059 SDValue CSel1 = N->getOperand(1);
18060
18061 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
18062 CSel1.getOpcode() != AArch64ISD::CSEL)
18063 return SDValue();
18064
18065 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
18066 return SDValue();
18067
18068 if (!isNullConstant(CSel0.getOperand(0)) ||
18069 !isOneConstant(CSel0.getOperand(1)) ||
18070 !isNullConstant(CSel1.getOperand(0)) ||
18071 !isOneConstant(CSel1.getOperand(1)))
18072 return SDValue();
18073
18074 SDValue Cmp0 = CSel0.getOperand(3);
18075 SDValue Cmp1 = CSel1.getOperand(3);
18078 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
18079 return SDValue();
18080 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
18081 Cmp0.getOpcode() == AArch64ISD::SUBS) {
18082 std::swap(Cmp0, Cmp1);
18083 std::swap(CC0, CC1);
18084 }
18085
18086 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
18087 return SDValue();
18088
18089 SDLoc DL(N);
18090 SDValue CCmp, Condition;
18091 unsigned NZCV;
18092
18093 if (N->getOpcode() == ISD::AND) {
18095 Condition = DAG.getConstant(InvCC0, DL, MVT_CC);
18097 } else {
18099 Condition = DAG.getConstant(CC0, DL, MVT_CC);
18101 }
18102
18103 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
18104
18105 auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
18106 if (Op1 && Op1->getAPIntValue().isNegative() &&
18107 Op1->getAPIntValue().sgt(-32)) {
18108 // CCMP accepts constants in the range [0, 31].
18109 // If Op1 is a constant in the range [-31, -1], we
18110 // can select CCMN instead to avoid the extra mov.
18111 SDValue AbsOp1 =
18112 DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
18113 CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1,
18114 NZCVOp, Condition, Cmp0);
18115 } else {
18116 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
18117 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
18118 }
18119 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
18120 CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
18121 CCmp);
18122}
18123
18125 const AArch64Subtarget *Subtarget,
18126 const AArch64TargetLowering &TLI) {
18127 SelectionDAG &DAG = DCI.DAG;
18128 EVT VT = N->getValueType(0);
18129
18130 if (SDValue R = performANDORCSELCombine(N, DAG))
18131 return R;
18132
18133 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
18134 return SDValue();
18135
18136 if (SDValue Res = tryCombineToBSL(N, DCI, TLI))
18137 return Res;
18138
18139 return SDValue();
18140}
18141
18143 if (!MemVT.getVectorElementType().isSimple())
18144 return false;
18145
18146 uint64_t MaskForTy = 0ull;
18147 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
18148 case MVT::i8:
18149 MaskForTy = 0xffull;
18150 break;
18151 case MVT::i16:
18152 MaskForTy = 0xffffull;
18153 break;
18154 case MVT::i32:
18155 MaskForTy = 0xffffffffull;
18156 break;
18157 default:
18158 return false;
18159 break;
18160 }
18161
18162 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
18163 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
18164 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
18165
18166 return false;
18167}
18168
18170 SDValue LeafOp = SDValue(N, 0);
18171 SDValue Op = N->getOperand(0);
18172 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
18173 LeafOp.getValueType() != Op.getValueType())
18174 Op = Op->getOperand(0);
18175 if (LeafOp.getValueType() == Op.getValueType())
18176 return Op;
18177 return SDValue();
18178}
18179
18182 SelectionDAG &DAG = DCI.DAG;
18183 SDValue Src = N->getOperand(0);
18184 unsigned Opc = Src->getOpcode();
18185
18186 // Zero/any extend of an unsigned unpack
18187 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
18188 SDValue UnpkOp = Src->getOperand(0);
18189 SDValue Dup = N->getOperand(1);
18190
18191 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
18192 return SDValue();
18193
18194 SDLoc DL(N);
18195 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
18196 if (!C)
18197 return SDValue();
18198
18199 uint64_t ExtVal = C->getZExtValue();
18200
18201 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
18202 return ((ExtVal == 0xFF && VT == MVT::i8) ||
18203 (ExtVal == 0xFFFF && VT == MVT::i16) ||
18204 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
18205 };
18206
18207 // If the mask is fully covered by the unpack, we don't need to push
18208 // a new AND onto the operand
18209 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
18210 if (MaskAndTypeMatch(EltTy))
18211 return Src;
18212
18213 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
18214 // to see if the mask is all-ones of size MemTy.
18215 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(UnpkOp);
18216 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
18217 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
18218 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
18219 if (MaskAndTypeMatch(EltTy))
18220 return Src;
18221 }
18222
18223 // Truncate to prevent a DUP with an over wide constant
18224 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
18225
18226 // Otherwise, make sure we propagate the AND to the operand
18227 // of the unpack
18228 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
18229 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
18230
18231 SDValue And = DAG.getNode(ISD::AND, DL,
18232 UnpkOp->getValueType(0), UnpkOp, Dup);
18233
18234 return DAG.getNode(Opc, DL, N->getValueType(0), And);
18235 }
18236
18237 if (DCI.isBeforeLegalizeOps())
18238 return SDValue();
18239
18240 // If both sides of AND operations are i1 splat_vectors then
18241 // we can produce just i1 splat_vector as the result.
18242 if (isAllActivePredicate(DAG, N->getOperand(0)))
18243 return N->getOperand(1);
18244 if (isAllActivePredicate(DAG, N->getOperand(1)))
18245 return N->getOperand(0);
18246
18247  if (!EnableCombineMGatherIntrinsics)
18248    return SDValue();
18249
18250 SDValue Mask = N->getOperand(1);
18251
18252 if (!Src.hasOneUse())
18253 return SDValue();
18254
18255 EVT MemVT;
18256
18257 // SVE load instructions perform an implicit zero-extend, which makes them
18258 // perfect candidates for combining.
18259 switch (Opc) {
18263 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
18264 break;
18280 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
18281 break;
18282 default:
18283 return SDValue();
18284 }
18285
18286 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
18287 return Src;
18288
18289 return SDValue();
18290}
18291
18292// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
18293static SDValue performANDSETCCCombine(SDNode *N,
18294                                      TargetLowering::DAGCombinerInfo &DCI) {
18295
18296 // This function performs an optimization on a specific pattern involving
18297 // an AND operation and SETCC (Set Condition Code) node.
18298
18299 SDValue SetCC = N->getOperand(0);
18300 EVT VT = N->getValueType(0);
18301 SelectionDAG &DAG = DCI.DAG;
18302
18303 // Checks if the current node (N) is used by any SELECT instruction and
18304 // returns an empty SDValue to avoid applying the optimization to prevent
18305 // incorrect results
18306 for (auto U : N->uses())
18307 if (U->getOpcode() == ISD::SELECT)
18308 return SDValue();
18309
18310 // Check if the operand is a SETCC node with floating-point comparison
18311 if (SetCC.getOpcode() == ISD::SETCC &&
18312 SetCC.getOperand(0).getValueType() == MVT::f32) {
18313
18314 SDValue Cmp;
18315    AArch64CC::CondCode CC;
18316
18317 // Check if the DAG is after legalization and if we can emit the conjunction
18318 if (!DCI.isBeforeLegalize() &&
18319 (Cmp = emitConjunction(DAG, SDValue(N, 0), CC))) {
18320
18321      AArch64CC::CondCode InvertedCC = AArch64CC::getInvertedCondCode(CC);
18322
18323 SDLoc DL(N);
18324 return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
18325 DAG.getConstant(0, DL, VT),
18326 DAG.getConstant(InvertedCC, DL, MVT::i32), Cmp);
18327 }
18328 }
18329 return SDValue();
18330}
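// Illustrative sketch (not from the original source): the conjunction fold
// above targets C input along the lines of
//
//   int both(float a, float b, float c, float d) { return a < b && c < d; }
//
// where the two float compares are expected to become fcmp + fccmp + cset
// rather than two csets joined by an and. The exact output is an assumption
// and depends on the subtarget and surrounding code.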
18331
18332static SDValue performANDCombine(SDNode *N,
18333                                 TargetLowering::DAGCombinerInfo &DCI) {
18334  SelectionDAG &DAG = DCI.DAG;
18335 SDValue LHS = N->getOperand(0);
18336 SDValue RHS = N->getOperand(1);
18337 EVT VT = N->getValueType(0);
18338
18339 if (SDValue R = performANDORCSELCombine(N, DAG))
18340 return R;
18341
18342 if (SDValue R = performANDSETCCCombine(N,DCI))
18343 return R;
18344
18345 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
18346 return SDValue();
18347
18348 if (VT.isScalableVector())
18349 return performSVEAndCombine(N, DCI);
18350
18351 // The combining code below works only for NEON vectors. In particular, it
18352 // does not work for SVE when dealing with vectors wider than 128 bits.
18353 if (!VT.is64BitVector() && !VT.is128BitVector())
18354 return SDValue();
18355
18356 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
18357 if (!BVN)
18358 return SDValue();
18359
18360 // AND does not accept an immediate, so check if we can use a BIC immediate
18361 // instruction instead. We do this here instead of using a (and x, (mvni imm))
18362 // pattern in isel, because some immediates may be lowered to the preferred
18363 // (and x, (movi imm)) form, even though an mvni representation also exists.
18364 APInt DefBits(VT.getSizeInBits(), 0);
18365 APInt UndefBits(VT.getSizeInBits(), 0);
18366 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
18367 SDValue NewOp;
18368
18369 // Any bits known to already be 0 need not be cleared again, which can help
18370 // reduce the size of the immediate to one supported by the instruction.
18371 KnownBits Known = DAG.computeKnownBits(LHS);
18372 APInt ZeroSplat(VT.getSizeInBits(), 0);
18373 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
18374 ZeroSplat |= Known.Zero.zext(VT.getSizeInBits())
18375 << (Known.Zero.getBitWidth() * I);
18376
18377 DefBits = ~(DefBits | ZeroSplat);
18378 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
18379 DefBits, &LHS)) ||
18380 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
18381 DefBits, &LHS)))
18382 return NewOp;
18383
18384 UndefBits = ~(UndefBits | ZeroSplat);
18385 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
18386 UndefBits, &LHS)) ||
18387 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
18388 UndefBits, &LHS)))
18389 return NewOp;
18390 }
18391
18392 return SDValue();
18393}
18394
18395static SDValue performFADDCombine(SDNode *N,
18396                                  TargetLowering::DAGCombinerInfo &DCI) {
18397  SelectionDAG &DAG = DCI.DAG;
18398 SDValue LHS = N->getOperand(0);
18399 SDValue RHS = N->getOperand(1);
18400 EVT VT = N->getValueType(0);
18401 SDLoc DL(N);
18402
18403 if (!N->getFlags().hasAllowReassociation())
18404 return SDValue();
18405
18406 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), b, c)
18407 auto ReassocComplex = [&](SDValue A, SDValue B) {
18408 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
18409 return SDValue();
18410 unsigned Opc = A.getConstantOperandVal(0);
18411 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
18412 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
18413 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
18414 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
18415 return SDValue();
18416 SDValue VCMLA = DAG.getNode(
18417 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
18418 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
18419 A.getOperand(2), A.getOperand(3));
18420 VCMLA->setFlags(A->getFlags());
18421 return VCMLA;
18422 };
18423 if (SDValue R = ReassocComplex(LHS, RHS))
18424 return R;
18425 if (SDValue R = ReassocComplex(RHS, LHS))
18426 return R;
18427
18428 return SDValue();
18429}
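// Illustrative sketch (not from the original source): with the ACLE complex
// intrinsics, the reassociation above corresponds to rewriting
//
//   float32x4_t f(float32x4_t a, float32x4_t b, float32x4_t c, float32x4_t d) {
//     return vaddq_f32(a, vcmlaq_f32(b, c, d));   // fadd(a, vcmla(b, c, d))
//   }
//
// into vcmlaq_f32(vaddq_f32(a, b), c, d), which relies on the reassociation
// fast-math flag being present. The intrinsic spelling here is an assumption
// used for illustration only.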
18430
18431static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
18432 switch (Opcode) {
18433 case ISD::STRICT_FADD:
18434 case ISD::FADD:
18435 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
18436 case ISD::ADD:
18437 return VT == MVT::i64;
18438 default:
18439 return false;
18440 }
18441}
18442
18443static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
18444                        AArch64CC::CondCode Cond);
18445
18446static bool isPredicateCCSettingOp(SDValue N) {
18447  if ((N.getOpcode() == ISD::SETCC) ||
18448 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
18449 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
18450 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
18451 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
18452 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
18453 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
18454 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
18455 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
18456 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
18457 // get_active_lane_mask is lowered to a whilelo instruction.
18458 N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
18459 return true;
18460
18461 return false;
18462}
18463
18464// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
18465// ... into: "ptrue p, all" + PTEST
18466static SDValue
18467performFirstTrueTestVectorCombine(SDNode *N,
18468                                  TargetLowering::DAGCombinerInfo &DCI,
18469                                  const AArch64Subtarget *Subtarget) {
18470 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18471 // Make sure PTEST can be legalised with illegal types.
18472 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
18473 return SDValue();
18474
18475 SDValue N0 = N->getOperand(0);
18476 EVT VT = N0.getValueType();
18477
18478 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
18479 !isNullConstant(N->getOperand(1)))
18480 return SDValue();
18481
18482  // Restrict the DAG combine to only cases where we're extracting from a
18483 // flag-setting operation.
18484 if (!isPredicateCCSettingOp(N0))
18485 return SDValue();
18486
18487 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
18488 SelectionDAG &DAG = DCI.DAG;
18489 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
18490 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
18491}
18492
18493// Materialize : Idx = (add (mul vscale, NumEls), -1)
18494// i1 = extract_vector_elt t37, Constant:i64<Idx>
18495// ... into: "ptrue p, all" + PTEST
18496static SDValue
18497performLastTrueTestVectorCombine(SDNode *N,
18498                                 TargetLowering::DAGCombinerInfo &DCI,
18499                                 const AArch64Subtarget *Subtarget) {
18500 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18501  // Make sure PTEST has legal types.
18502 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
18503 return SDValue();
18504
18505 SDValue N0 = N->getOperand(0);
18506 EVT OpVT = N0.getValueType();
18507
18508 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
18509 return SDValue();
18510
18511 // Idx == (add (mul vscale, NumEls), -1)
18512 SDValue Idx = N->getOperand(1);
18513 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
18514 return SDValue();
18515
18516 SDValue VS = Idx.getOperand(0);
18517 if (VS.getOpcode() != ISD::VSCALE)
18518 return SDValue();
18519
18520 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
18521 if (VS.getConstantOperandVal(0) != NumEls)
18522 return SDValue();
18523
18524 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
18525 SelectionDAG &DAG = DCI.DAG;
18526 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
18527 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
18528}
18529
18530static SDValue
18531performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18532                               const AArch64Subtarget *Subtarget) {
18533 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18534 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
18535 return Res;
18536 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
18537 return Res;
18538
18539 SelectionDAG &DAG = DCI.DAG;
18540 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
18541
18542 EVT VT = N->getValueType(0);
18543 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
18544 bool IsStrict = N0->isStrictFPOpcode();
18545
18546 // extract(dup x) -> x
18547 if (N0.getOpcode() == AArch64ISD::DUP)
18548 return VT.isInteger() ? DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT)
18549 : N0.getOperand(0);
18550
18551 // Rewrite for pairwise fadd pattern
18552 // (f32 (extract_vector_elt
18553 // (fadd (vXf32 Other)
18554 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
18555 // ->
18556 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
18557 // (extract_vector_elt (vXf32 Other) 1))
18558 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
18559 // we can only do this when it's used only by the extract_vector_elt.
18560 if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
18561 (!IsStrict || N0.hasOneUse())) {
18562 SDLoc DL(N0);
18563 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
18564 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
18565
18566 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
18567 SDValue Other = N00;
18568
18569 // And handle the commutative case.
18570 if (!Shuffle) {
18571 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
18572 Other = N01;
18573 }
18574
18575 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
18576 Other == Shuffle->getOperand(0)) {
18577 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
18578 DAG.getConstant(0, DL, MVT::i64));
18579 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
18580 DAG.getConstant(1, DL, MVT::i64));
18581 if (!IsStrict)
18582 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
18583
18584 // For strict_fadd we need uses of the final extract_vector to be replaced
18585 // with the strict_fadd, but we also need uses of the chain output of the
18586 // original strict_fadd to use the chain output of the new strict_fadd as
18587 // otherwise it may not be deleted.
18588 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
18589 {VT, MVT::Other},
18590 {N0->getOperand(0), Extract1, Extract2});
18591 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
18592 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
18593 return SDValue(N, 0);
18594 }
18595 }
18596
18597 return SDValue();
18598}
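// Illustrative sketch (not from the original source): at the C level the
// pairwise rewrite above is the pattern produced by a two-lane horizontal add,
// e.g.
//
//   float hadd(float32x2_t v) { return vget_lane_f32(v, 0) + vget_lane_f32(v, 1); }
//
// which is expected to select a single faddp rather than two lane extracts and
// a scalar fadd. The exact codegen is an assumption and may vary by subtarget.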
18599
18600static SDValue performConcatVectorsCombine(SDNode *N,
18601                                           TargetLowering::DAGCombinerInfo &DCI,
18602                                           SelectionDAG &DAG) {
18603 SDLoc dl(N);
18604 EVT VT = N->getValueType(0);
18605 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
18606 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
18607
18608 if (VT.isScalableVector())
18609 return SDValue();
18610
18611 // Optimize concat_vectors of truncated vectors, where the intermediate
18612 // type is illegal, to avoid said illegality, e.g.,
18613 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
18614 // (v2i16 (truncate (v2i64)))))
18615 // ->
18616 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
18617 // (v4i32 (bitcast (v2i64))),
18618 // <0, 2, 4, 6>)))
18619 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
18620 // on both input and result type, so we might generate worse code.
18621 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
18622 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
18623 N1Opc == ISD::TRUNCATE) {
18624 SDValue N00 = N0->getOperand(0);
18625 SDValue N10 = N1->getOperand(0);
18626 EVT N00VT = N00.getValueType();
18627
18628 if (N00VT == N10.getValueType() &&
18629 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
18630 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
18631 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
18632      SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
18633      for (size_t i = 0; i < Mask.size(); ++i)
18634 Mask[i] = i * 2;
18635 return DAG.getNode(ISD::TRUNCATE, dl, VT,
18636 DAG.getVectorShuffle(
18637 MidVT, dl,
18638 DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
18639 DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
18640 }
18641 }
18642
18643 if (N->getOperand(0).getValueType() == MVT::v4i8 ||
18644 N->getOperand(0).getValueType() == MVT::v2i16 ||
18645 N->getOperand(0).getValueType() == MVT::v2i8) {
18646 EVT SrcVT = N->getOperand(0).getValueType();
18647 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
18648 // loads to prevent having to go through the v4i8 load legalization that
18649 // needs to extend each element into a larger type.
18650 if (N->getNumOperands() % 2 == 0 &&
18651 all_of(N->op_values(), [SrcVT](SDValue V) {
18652 if (V.getValueType() != SrcVT)
18653 return false;
18654 if (V.isUndef())
18655 return true;
18656 LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
18657 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
18658 LD->getExtensionType() == ISD::NON_EXTLOAD;
18659 })) {
18660 EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
18661 EVT NVT = EVT::getVectorVT(*DAG.getContext(), FVT, N->getNumOperands());
18662      SmallVector<SDValue> Ops;
18663
18664 for (unsigned i = 0; i < N->getNumOperands(); i++) {
18665 SDValue V = N->getOperand(i);
18666 if (V.isUndef())
18667 Ops.push_back(DAG.getUNDEF(FVT));
18668 else {
18669 LoadSDNode *LD = cast<LoadSDNode>(V);
18670 SDValue NewLoad = DAG.getLoad(FVT, dl, LD->getChain(),
18671 LD->getBasePtr(), LD->getMemOperand());
18672 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
18673 Ops.push_back(NewLoad);
18674 }
18675 }
18676 return DAG.getBitcast(N->getValueType(0),
18677 DAG.getBuildVector(NVT, dl, Ops));
18678 }
18679 }
18680
18681 // Canonicalise concat_vectors to replace concatenations of truncated nots
18682 // with nots of concatenated truncates. This in some cases allows for multiple
18683 // redundant negations to be eliminated.
18684 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
18685 // (v4i16 (truncate (not (v4i32)))))
18686 // ->
18687 // (not (concat_vectors (v4i16 (truncate (v4i32))),
18688 // (v4i16 (truncate (v4i32)))))
18689 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
18690 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
18691 N->isOnlyUserOf(N1.getNode())) {
18692 auto isBitwiseVectorNegate = [](SDValue V) {
18693 return V->getOpcode() == ISD::XOR &&
18694 ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
18695 };
18696 SDValue N00 = N0->getOperand(0);
18697 SDValue N10 = N1->getOperand(0);
18698 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
18699 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
18700 return DAG.getNOT(
18701 dl,
18702 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
18703 DAG.getNode(ISD::TRUNCATE, dl, N0.getValueType(),
18704 N00->getOperand(0)),
18705 DAG.getNode(ISD::TRUNCATE, dl, N1.getValueType(),
18706 N10->getOperand(0))),
18707 VT);
18708 }
18709 }
18710
18711 // Wait till after everything is legalized to try this. That way we have
18712 // legal vector types and such.
18713 if (DCI.isBeforeLegalizeOps())
18714 return SDValue();
18715
18716 // Optimise concat_vectors of two [us]avgceils or [us]avgfloors with a 128-bit
18717  // destination size, combine into an avg of two concats of the source
18718 // vectors. eg: concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c),
18719 // concat(b, d))
18720 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
18721 (N0Opc == ISD::AVGCEILU || N0Opc == ISD::AVGCEILS ||
18722 N0Opc == ISD::AVGFLOORU || N0Opc == ISD::AVGFLOORS) &&
18723 N0->hasOneUse() && N1->hasOneUse()) {
18724 SDValue N00 = N0->getOperand(0);
18725 SDValue N01 = N0->getOperand(1);
18726 SDValue N10 = N1->getOperand(0);
18727 SDValue N11 = N1->getOperand(1);
18728
18729 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
18730 SDValue Concat0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N00, N10);
18731 SDValue Concat1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N01, N11);
18732 return DAG.getNode(N0Opc, dl, VT, Concat0, Concat1);
18733 }
18734 }
18735
18736 auto IsRSHRN = [](SDValue Shr) {
18737 if (Shr.getOpcode() != AArch64ISD::VLSHR)
18738 return false;
18739 SDValue Op = Shr.getOperand(0);
18740 EVT VT = Op.getValueType();
18741 unsigned ShtAmt = Shr.getConstantOperandVal(1);
18742 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
18743 return false;
18744
18745 APInt Imm;
18746 if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift)
18747 Imm = APInt(VT.getScalarSizeInBits(),
18748 Op.getOperand(1).getConstantOperandVal(0)
18749 << Op.getOperand(1).getConstantOperandVal(1));
18750 else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP &&
18751 isa<ConstantSDNode>(Op.getOperand(1).getOperand(0)))
18752 Imm = APInt(VT.getScalarSizeInBits(),
18753 Op.getOperand(1).getConstantOperandVal(0));
18754 else
18755 return false;
18756
18757 if (Imm != 1ULL << (ShtAmt - 1))
18758 return false;
18759 return true;
18760 };
18761
18762 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
18763 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
18764 ((IsRSHRN(N1) &&
18765        N0.getConstantOperandVal(1) == N1.getConstantOperandVal(1)) ||
18766       N1.isUndef())) {
18767 SDValue X = N0.getOperand(0).getOperand(0);
18768 SDValue Y = N1.isUndef() ? DAG.getUNDEF(X.getValueType())
18769 : N1.getOperand(0).getOperand(0);
18770 EVT BVT =
18771 X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
18772 SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, dl, BVT, X, Y);
18773 SDValue Add = DAG.getNode(
18774 ISD::ADD, dl, BVT, CC,
18775 DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), dl, BVT));
18776 SDValue Shr =
18777 DAG.getNode(AArch64ISD::VLSHR, dl, BVT, Add, N0.getOperand(1));
18778 return Shr;
18779 }
18780
18781 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
18782 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
18783 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
18784 N0.getOperand(1) == N1.getOperand(1)) {
18785 SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(0),
18786 DAG.getUNDEF(N0.getValueType()));
18787 SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(1),
18788 DAG.getUNDEF(N0.getValueType()));
18789 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, E0, E1);
18790 }
18791
18792 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
18793 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
18794 // canonicalise to that.
18795 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
18796 assert(VT.getScalarSizeInBits() == 64);
18797 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
18798 DAG.getConstant(0, dl, MVT::i64));
18799 }
18800
18801 // Canonicalise concat_vectors so that the right-hand vector has as few
18802 // bit-casts as possible before its real operation. The primary matching
18803 // destination for these operations will be the narrowing "2" instructions,
18804 // which depend on the operation being performed on this right-hand vector.
18805 // For example,
18806 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
18807 // becomes
18808 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
18809
18810 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
18811 return SDValue();
18812 SDValue RHS = N1->getOperand(0);
18813 MVT RHSTy = RHS.getValueType().getSimpleVT();
18814 // If the RHS is not a vector, this is not the pattern we're looking for.
18815 if (!RHSTy.isVector())
18816 return SDValue();
18817
18818 LLVM_DEBUG(
18819 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
18820
18821 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
18822 RHSTy.getVectorNumElements() * 2);
18823 return DAG.getNode(ISD::BITCAST, dl, VT,
18824 DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
18825 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
18826 RHS));
18827}
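// Illustrative sketch (not from the original source): the avg-of-concats rule
// above corresponds to rewriting, for 64-bit inputs,
//
//   uint8x16_t f(uint8x8_t a, uint8x8_t b, uint8x8_t c, uint8x8_t d) {
//     return vcombine_u8(vhadd_u8(a, b), vhadd_u8(c, d));
//   }
//
// into a single full-width halving add of vcombine_u8(a, c) and
// vcombine_u8(b, d). The intrinsic spelling is an assumption used for
// illustration only.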
18828
18829static SDValue
18830performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18831                               SelectionDAG &DAG) {
18832 if (DCI.isBeforeLegalizeOps())
18833 return SDValue();
18834
18835 EVT VT = N->getValueType(0);
18836 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
18837 return SDValue();
18838
18839 SDValue V = N->getOperand(0);
18840
18841 // NOTE: This combine exists in DAGCombiner, but that version's legality check
18842 // blocks this combine because the non-const case requires custom lowering.
18843 //
18844 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
18845 if (V.getOpcode() == ISD::SPLAT_VECTOR)
18846 if (isa<ConstantSDNode>(V.getOperand(0)))
18847 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
18848
18849 return SDValue();
18850}
18851
18852static SDValue
18853performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18854                              SelectionDAG &DAG) {
18855 SDLoc DL(N);
18856 SDValue Vec = N->getOperand(0);
18857 SDValue SubVec = N->getOperand(1);
18858 uint64_t IdxVal = N->getConstantOperandVal(2);
18859 EVT VecVT = Vec.getValueType();
18860 EVT SubVT = SubVec.getValueType();
18861
18862 // Only do this for legal fixed vector types.
18863 if (!VecVT.isFixedLengthVector() ||
18864 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
18865 !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
18866 return SDValue();
18867
18868 // Ignore widening patterns.
18869 if (IdxVal == 0 && Vec.isUndef())
18870 return SDValue();
18871
18872 // Subvector must be half the width and an "aligned" insertion.
18873 unsigned NumSubElts = SubVT.getVectorNumElements();
18874 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
18875 (IdxVal != 0 && IdxVal != NumSubElts))
18876 return SDValue();
18877
18878 // Fold insert_subvector -> concat_vectors
18879 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
18880 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
18881 SDValue Lo, Hi;
18882 if (IdxVal == 0) {
18883 Lo = SubVec;
18884 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
18885 DAG.getVectorIdxConstant(NumSubElts, DL));
18886 } else {
18887 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
18888 DAG.getVectorIdxConstant(0, DL));
18889 Hi = SubVec;
18890 }
18891 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
18892}
18893
18894static SDValue tryCombineFixedPointConvert(SDNode *N,
18895                                           TargetLowering::DAGCombinerInfo &DCI,
18896                                           SelectionDAG &DAG) {
18897 // Wait until after everything is legalized to try this. That way we have
18898 // legal vector types and such.
18899 if (DCI.isBeforeLegalizeOps())
18900 return SDValue();
18901 // Transform a scalar conversion of a value from a lane extract into a
18902 // lane extract of a vector conversion. E.g., from foo1 to foo2:
18903 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
18904 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
18905 //
18906 // The second form interacts better with instruction selection and the
18907 // register allocator to avoid cross-class register copies that aren't
18908 // coalescable due to a lane reference.
18909
18910 // Check the operand and see if it originates from a lane extract.
18911 SDValue Op1 = N->getOperand(1);
18912  if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
18913    return SDValue();
18914
18915 // Yep, no additional predication needed. Perform the transform.
18916 SDValue IID = N->getOperand(0);
18917 SDValue Shift = N->getOperand(2);
18918 SDValue Vec = Op1.getOperand(0);
18919 SDValue Lane = Op1.getOperand(1);
18920 EVT ResTy = N->getValueType(0);
18921 EVT VecResTy;
18922 SDLoc DL(N);
18923
18924 // The vector width should be 128 bits by the time we get here, even
18925 // if it started as 64 bits (the extract_vector handling will have
18926 // done so). Bail if it is not.
18927 if (Vec.getValueSizeInBits() != 128)
18928 return SDValue();
18929
18930 if (Vec.getValueType() == MVT::v4i32)
18931 VecResTy = MVT::v4f32;
18932 else if (Vec.getValueType() == MVT::v2i64)
18933 VecResTy = MVT::v2f64;
18934 else
18935 return SDValue();
18936
18937 SDValue Convert =
18938 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
18939 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
18940}
18941
18942// AArch64 high-vector "long" operations are formed by performing the non-high
18943// version on an extract_subvector of each operand which gets the high half:
18944//
18945// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
18946//
18947// However, there are cases which don't have an extract_high explicitly, but
18948// have another operation that can be made compatible with one for free. For
18949// example:
18950//
18951// (dupv64 scalar) --> (extract_high (dup128 scalar))
18952//
18953// This routine does the actual conversion of such DUPs, once outer routines
18954// have determined that everything else is in order.
18955// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
18956// similarly here.
18957static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
18958  MVT VT = N.getSimpleValueType();
18959 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
18960 N.getConstantOperandVal(1) == 0)
18961 N = N.getOperand(0);
18962
18963 switch (N.getOpcode()) {
18964 case AArch64ISD::DUP:
18965  case AArch64ISD::DUPLANE8:
18966  case AArch64ISD::DUPLANE16:
18967  case AArch64ISD::DUPLANE32:
18968  case AArch64ISD::DUPLANE64:
18969  case AArch64ISD::MOVI:
18970  case AArch64ISD::MOVIshift:
18971  case AArch64ISD::MOVIedit:
18972  case AArch64ISD::MOVImsl:
18973  case AArch64ISD::MVNIshift:
18974  case AArch64ISD::MVNImsl:
18975    break;
18976 default:
18977 // FMOV could be supported, but isn't very useful, as it would only occur
18978  // if you passed a bitcast'd floating point immediate to an eligible long
18979 // integer op (addl, smull, ...).
18980 return SDValue();
18981 }
18982
18983 if (!VT.is64BitVector())
18984 return SDValue();
18985
18986 SDLoc DL(N);
18987 unsigned NumElems = VT.getVectorNumElements();
18988 if (N.getValueType().is64BitVector()) {
18989 MVT ElementTy = VT.getVectorElementType();
18990 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
18991 N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
18992 }
18993
18994 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
18995 DAG.getConstant(NumElems, DL, MVT::i64));
18996}
18997
18998static bool isEssentiallyExtractHighSubvector(SDValue N) {
18999  if (N.getOpcode() == ISD::BITCAST)
19000 N = N.getOperand(0);
19001 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
19002 return false;
19003 if (N.getOperand(0).getValueType().isScalableVector())
19004 return false;
19005 return N.getConstantOperandAPInt(1) ==
19006 N.getOperand(0).getValueType().getVectorNumElements() / 2;
19007}
19008
19009/// Helper structure to keep track of ISD::SET_CC operands.
19010struct GenericSetCCInfo {
19011  const SDValue *Opnd0;
19012  const SDValue *Opnd1;
19013  ISD::CondCode CC;
19014};
19015
19016/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
19017struct AArch64SetCCInfo {
19018  const SDValue *Cmp;
19019  AArch64CC::CondCode CC;
19020};
19021
19022/// Helper structure to keep track of SetCC information.
19023union SetCCInfo {
19024  GenericSetCCInfo Generic;
19025  AArch64SetCCInfo AArch64;
19026};
19027
19028/// Helper structure to be able to read SetCC information. If the IsAArch64
19029/// field is set to true, Info is an AArch64SetCCInfo; otherwise Info is a
19030/// GenericSetCCInfo.
19031struct SetCCInfoAndKind {
19032  SetCCInfo Info;
19033  bool IsAArch64;
19034};
19035
19036/// Check whether or not \p Op is a SET_CC operation, either a generic or
19037/// an
19038/// AArch64 lowered one.
19039/// \p SetCCInfo is filled accordingly.
19040/// \post SetCCInfo is meaningful only when this function returns true.
19041/// \return True when Op is a kind of SET_CC operation.
19042static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
19043  // If this is a setcc, this is straightforward.
19044 if (Op.getOpcode() == ISD::SETCC) {
19045 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
19046 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
19047 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
19048 SetCCInfo.IsAArch64 = false;
19049 return true;
19050 }
19051 // Otherwise, check if this is a matching csel instruction.
19052 // In other words:
19053 // - csel 1, 0, cc
19054 // - csel 0, 1, !cc
19055 if (Op.getOpcode() != AArch64ISD::CSEL)
19056 return false;
19057 // Set the information about the operands.
19058 // TODO: we want the operands of the Cmp not the csel
19059 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
19060 SetCCInfo.IsAArch64 = true;
19061 SetCCInfo.Info.AArch64.CC =
19062 static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
19063
19064  // Check that the operands match the constraints:
19065 // (1) Both operands must be constants.
19066 // (2) One must be 1 and the other must be 0.
19067 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
19068 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
19069
19070 // Check (1).
19071 if (!TValue || !FValue)
19072 return false;
19073
19074 // Check (2).
19075 if (!TValue->isOne()) {
19076 // Update the comparison when we are interested in !cc.
19077 std::swap(TValue, FValue);
19078 SetCCInfo.Info.AArch64.CC =
19079        AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
19080  }
19081 return TValue->isOne() && FValue->isZero();
19082}
19083
19084// Returns true if Op is setcc or zext of setcc.
19085static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
19086 if (isSetCC(Op, Info))
19087 return true;
19088 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
19089 isSetCC(Op->getOperand(0), Info));
19090}
19091
19092// The folding we want to perform is:
19093// (add x, [zext] (setcc cc ...) )
19094// -->
19095// (csel x, (add x, 1), !cc ...)
19096//
19097// The latter will get matched to a CSINC instruction.
19098static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
19099  assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
19100 SDValue LHS = Op->getOperand(0);
19101 SDValue RHS = Op->getOperand(1);
19102 SetCCInfoAndKind InfoAndKind;
19103
19104 // If both operands are a SET_CC, then we don't want to perform this
19105 // folding and create another csel as this results in more instructions
19106 // (and higher register usage).
19107 if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
19108 isSetCCOrZExtSetCC(RHS, InfoAndKind))
19109 return SDValue();
19110
19111 // If neither operand is a SET_CC, give up.
19112 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
19113 std::swap(LHS, RHS);
19114 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
19115 return SDValue();
19116 }
19117
19118  // FIXME: This could be generalized to work for FP comparisons.
19119 EVT CmpVT = InfoAndKind.IsAArch64
19120 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
19121 : InfoAndKind.Info.Generic.Opnd0->getValueType();
19122 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
19123 return SDValue();
19124
19125 SDValue CCVal;
19126 SDValue Cmp;
19127 SDLoc dl(Op);
19128 if (InfoAndKind.IsAArch64) {
19129 CCVal = DAG.getConstant(
19130        AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
19131        MVT::i32);
19132 Cmp = *InfoAndKind.Info.AArch64.Cmp;
19133 } else
19134 Cmp = getAArch64Cmp(
19135 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
19136 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
19137 dl);
19138
19139 EVT VT = Op->getValueType(0);
19140 LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
19141 return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
19142}
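// Illustrative sketch (not from the original source): the fold above is what
// lets source such as
//
//   int count(int x, int a, int b) { return x + (a == b); }
//
// select "cmp; cinc" (a CSINC form) instead of materializing the boolean with
// cset and then adding it. The exact instruction sequence is an assumption and
// may differ with surrounding code.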
19143
19144// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
19145static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
19146  EVT VT = N->getValueType(0);
19147 // Only scalar integer and vector types.
19148 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
19149 return SDValue();
19150
19151 SDValue LHS = N->getOperand(0);
19152 SDValue RHS = N->getOperand(1);
19153 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19154 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
19155 return SDValue();
19156
19157 auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
19158 auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
19159 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
19160 return SDValue();
19161
19162 SDValue Op1 = LHS->getOperand(0);
19163 SDValue Op2 = RHS->getOperand(0);
19164 EVT OpVT1 = Op1.getValueType();
19165 EVT OpVT2 = Op2.getValueType();
19166 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
19167 Op2.getOpcode() != AArch64ISD::UADDV ||
19168 OpVT1.getVectorElementType() != VT)
19169 return SDValue();
19170
19171 SDValue Val1 = Op1.getOperand(0);
19172 SDValue Val2 = Op2.getOperand(0);
19173 EVT ValVT = Val1->getValueType(0);
19174 SDLoc DL(N);
19175 SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
19176 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
19177 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
19178 DAG.getConstant(0, DL, MVT::i64));
19179}
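// Illustrative sketch (not from the original source): at the C level this
// matches summing two independent vector reductions, e.g.
//
//   uint32_t f(uint32x4_t a, uint32x4_t b) { return vaddvq_u32(a) + vaddvq_u32(b); }
//
// which can be done as one across-vector add of (a + b) instead of two
// reductions plus a scalar add. The intrinsic names are mentioned for
// illustration only.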
19180
19181/// Perform the scalar expression combine in the form of:
19182/// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
19183/// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
19184static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
19185  EVT VT = N->getValueType(0);
19186 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
19187 return SDValue();
19188
19189 SDValue LHS = N->getOperand(0);
19190 SDValue RHS = N->getOperand(1);
19191
19192  // Handle commutativity.
19193 if (LHS.getOpcode() != AArch64ISD::CSEL &&
19194 LHS.getOpcode() != AArch64ISD::CSNEG) {
19195 std::swap(LHS, RHS);
19196 if (LHS.getOpcode() != AArch64ISD::CSEL &&
19197 LHS.getOpcode() != AArch64ISD::CSNEG) {
19198 return SDValue();
19199 }
19200 }
19201
19202 if (!LHS.hasOneUse())
19203 return SDValue();
19204
19205 AArch64CC::CondCode AArch64CC =
19206 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
19207
19208  // The CSEL should include a constant one operand, and the CSNEG should
19209  // include a one or negative-one operand.
19210 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
19211 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
19212 if (!CTVal || !CFVal)
19213 return SDValue();
19214
19215 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
19216 (CTVal->isOne() || CFVal->isOne())) &&
19217 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
19218 (CTVal->isOne() || CFVal->isAllOnes())))
19219 return SDValue();
19220
19221 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
19222 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
19223 !CFVal->isOne()) {
19224 std::swap(CTVal, CFVal);
19225 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
19226 }
19227
19228 SDLoc DL(N);
19229 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
19230 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
19231 !CFVal->isAllOnes()) {
19232 APInt C = -1 * CFVal->getAPIntValue();
19233 CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
19234 CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
19235 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
19236 }
19237
19238  // It might be neutral for larger constants, as the immediate needs to be
19239  // materialized in a register.
19240 APInt ADDC = CTVal->getAPIntValue();
19241 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19242 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
19243 return SDValue();
19244
19245 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
19246 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
19247 "Unexpected constant value");
19248
19249 SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
19250 SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32);
19251 SDValue Cmp = LHS.getOperand(3);
19252
19253 return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
19254}
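// Illustrative sketch (not from the original source): source such as
//
//   int f(int a, int b, int c) { return b + (a == 0 ? c : 1); }
//
// produces the CSEL(c, 1, cc) + b shape handled above, which can select a
// csinc with the add folded in (b+c on the true arm, b+1 otherwise) rather
// than csel followed by a separate add. The exact codegen is an assumption.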
19255
19256// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
19257static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
19258  EVT VT = N->getValueType(0);
19259 if (N->getOpcode() != ISD::ADD)
19260 return SDValue();
19261
19262 SDValue Dot = N->getOperand(0);
19263 SDValue A = N->getOperand(1);
19264  // Handle commutativity.
19265 auto isZeroDot = [](SDValue Dot) {
19266 return (Dot.getOpcode() == AArch64ISD::UDOT ||
19267 Dot.getOpcode() == AArch64ISD::SDOT) &&
19268           isZerosVector(Dot.getOperand(0).getNode());
19269  };
19270 if (!isZeroDot(Dot))
19271 std::swap(Dot, A);
19272 if (!isZeroDot(Dot))
19273 return SDValue();
19274
19275 return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
19276 Dot.getOperand(2));
19277}
19278
19279static bool isNegatedInteger(SDValue Op) {
19280  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
19281}
19282
19283static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
19284  SDLoc DL(Op);
19285 EVT VT = Op.getValueType();
19286 SDValue Zero = DAG.getConstant(0, DL, VT);
19287 return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
19288}
19289
19290// Try to fold
19291//
19292// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
19293//
19294// The folding helps csel to be matched with csneg without generating
19295// redundant neg instruction, which includes negation of the csel expansion
19296// of abs node lowered by lowerABS.
19297static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
19298  if (!isNegatedInteger(SDValue(N, 0)))
19299 return SDValue();
19300
19301 SDValue CSel = N->getOperand(1);
19302 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
19303 return SDValue();
19304
19305 SDValue N0 = CSel.getOperand(0);
19306 SDValue N1 = CSel.getOperand(1);
19307
19308  // If neither of them is a negation, the fold is not worthwhile, as it
19309  // would introduce two additional negations while removing only one.
19310 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
19311 return SDValue();
19312
19313 SDValue N0N = getNegatedInteger(N0, DAG);
19314 SDValue N1N = getNegatedInteger(N1, DAG);
19315
19316 SDLoc DL(N);
19317 EVT VT = CSel.getValueType();
19318 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
19319 CSel.getOperand(3));
19320}
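// Illustrative sketch (not from the original source): negating an absolute
// value, e.g.
//
//   int negabs(int x) { return -(x < 0 ? -x : x); }
//
// yields (neg (csel ...)) where one arm is already a negation; pushing the neg
// into the csel lets it match csneg directly without a separate neg
// instruction. The exact codegen is an assumption.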
19321
19322// The basic add/sub long vector instructions have variants with "2" on the end
19323// which act on the high-half of their inputs. They are normally matched by
19324// patterns like:
19325//
19326// (add (zeroext (extract_high LHS)),
19327// (zeroext (extract_high RHS)))
19328// -> uaddl2 vD, vN, vM
19329//
19330// However, if one of the extracts is something like a duplicate, this
19331// instruction can still be used profitably. This function puts the DAG into a
19332// more appropriate form for those patterns to trigger.
19333static SDValue performAddSubLongCombine(SDNode *N,
19334                                        TargetLowering::DAGCombinerInfo &DCI) {
19335  SelectionDAG &DAG = DCI.DAG;
19336 if (DCI.isBeforeLegalizeOps())
19337 return SDValue();
19338
19339 MVT VT = N->getSimpleValueType(0);
19340 if (!VT.is128BitVector()) {
19341 if (N->getOpcode() == ISD::ADD)
19342 return performSetccAddFolding(N, DAG);
19343 return SDValue();
19344 }
19345
19346 // Make sure both branches are extended in the same way.
19347 SDValue LHS = N->getOperand(0);
19348 SDValue RHS = N->getOperand(1);
19349 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
19350 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
19351 LHS.getOpcode() != RHS.getOpcode())
19352 return SDValue();
19353
19354 unsigned ExtType = LHS.getOpcode();
19355
19356 // It's not worth doing if at least one of the inputs isn't already an
19357 // extract, but we don't know which it'll be so we have to try both.
19358 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
19359 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
19360 if (!RHS.getNode())
19361 return SDValue();
19362
19363 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
19364 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
19365 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
19366 if (!LHS.getNode())
19367 return SDValue();
19368
19369 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
19370 }
19371
19372 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
19373}
19374
19375static bool isCMP(SDValue Op) {
19376 return Op.getOpcode() == AArch64ISD::SUBS &&
19377 !Op.getNode()->hasAnyUseOfValue(0);
19378}
19379
19380// (CSEL 1 0 CC Cond) => CC
19381// (CSEL 0 1 CC Cond) => !CC
19382static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
19383 if (Op.getOpcode() != AArch64ISD::CSEL)
19384 return std::nullopt;
19385 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
19386 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
19387 return std::nullopt;
19388 SDValue OpLHS = Op.getOperand(0);
19389 SDValue OpRHS = Op.getOperand(1);
19390 if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
19391 return CC;
19392 if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
19393 return getInvertedCondCode(CC);
19394
19395 return std::nullopt;
19396}
19397
19398// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
19399// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
19400static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
19401 SDValue CmpOp = Op->getOperand(2);
19402 if (!isCMP(CmpOp))
19403 return SDValue();
19404
19405 if (IsAdd) {
19406 if (!isOneConstant(CmpOp.getOperand(1)))
19407 return SDValue();
19408 } else {
19409 if (!isNullConstant(CmpOp.getOperand(0)))
19410 return SDValue();
19411 }
19412
19413 SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
19414 auto CC = getCSETCondCode(CsetOp);
19415 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
19416 return SDValue();
19417
19418 return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
19419 Op->getOperand(0), Op->getOperand(1),
19420 CsetOp.getOperand(3));
19421}
19422
19423// (ADC x 0 cond) => (CINC x HS cond)
19424static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
19425  SDValue LHS = N->getOperand(0);
19426 SDValue RHS = N->getOperand(1);
19427 SDValue Cond = N->getOperand(2);
19428
19429 if (!isNullConstant(RHS))
19430 return SDValue();
19431
19432 EVT VT = N->getValueType(0);
19433 SDLoc DL(N);
19434
19435 // (CINC x cc cond) <=> (CSINC x x !cc cond)
19436 SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32);
19437 return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
19438}
19439
19440// Transform vector add(zext i8 to i32, zext i8 to i32)
19441// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
19442// This allows extra uses of saddl/uaddl at the lower vector widths, and fewer
19443// extends.
19444static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG) {
19445  EVT VT = N->getValueType(0);
19446 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
19447 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
19448 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
19449 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
19450 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
19451 N->getOperand(0).getOperand(0).getValueType() !=
19452 N->getOperand(1).getOperand(0).getValueType())
19453 return SDValue();
19454
19455 SDValue N0 = N->getOperand(0).getOperand(0);
19456 SDValue N1 = N->getOperand(1).getOperand(0);
19457 EVT InVT = N0.getValueType();
19458
19459 EVT S1 = InVT.getScalarType();
19460 EVT S2 = VT.getScalarType();
19461 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
19462 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
19463 SDLoc DL(N);
19464 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
19467 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
19468 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
19469 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
19470 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewOp);
19471 }
19472 return SDValue();
19473}
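// Illustrative sketch (not from the original source): a widening loop such as
//
//   void f(int *dst, const unsigned char *a, const unsigned char *b, int n) {
//     for (int i = 0; i < n; i++)
//       dst[i] = a[i] + b[i];   // i8 + i8 widened to i32
//   }
//
// benefits from doing the add at i16 (uaddl) and then sign-extending the
// result to i32, instead of extending both inputs to i32 first. Whether the
// vectorizer produces exactly this shape is an assumption.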
19474
19475static SDValue performBuildVectorCombine(SDNode *N,
19476                                         TargetLowering::DAGCombinerInfo &DCI,
19477                                         SelectionDAG &DAG) {
19478 SDLoc DL(N);
19479 EVT VT = N->getValueType(0);
19480
19481 if (VT == MVT::v4f16 || VT == MVT::v4bf16) {
19482 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1),
19483 Elt2 = N->getOperand(2), Elt3 = N->getOperand(3);
19484 if (Elt0->getOpcode() == ISD::FP_ROUND &&
19485 Elt1->getOpcode() == ISD::FP_ROUND &&
19486 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
19487 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
19488 Elt0->getConstantOperandVal(1) == Elt1->getConstantOperandVal(1) &&
19489        Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19490        Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19491 // Constant index.
19492 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
19493 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
19494 Elt0->getOperand(0)->getOperand(0) ==
19495 Elt1->getOperand(0)->getOperand(0) &&
19496 Elt0->getOperand(0)->getConstantOperandVal(1) == 0 &&
19497 Elt1->getOperand(0)->getConstantOperandVal(1) == 1) {
19498 SDValue LowLanesSrcVec = Elt0->getOperand(0)->getOperand(0);
19499 if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
19500 SDValue HighLanes;
19501 if (Elt2->getOpcode() == ISD::UNDEF &&
19502 Elt3->getOpcode() == ISD::UNDEF) {
19503 HighLanes = DAG.getUNDEF(MVT::v2f32);
19504 } else if (Elt2->getOpcode() == ISD::FP_ROUND &&
19505 Elt3->getOpcode() == ISD::FP_ROUND &&
19506 isa<ConstantSDNode>(Elt2->getOperand(1)) &&
19507 isa<ConstantSDNode>(Elt3->getOperand(1)) &&
19508 Elt2->getConstantOperandVal(1) ==
19509 Elt3->getConstantOperandVal(1) &&
19510 Elt2->getOperand(0)->getOpcode() ==
19511                         ISD::EXTRACT_VECTOR_ELT &&
19512                 Elt3->getOperand(0)->getOpcode() ==
19513                         ISD::EXTRACT_VECTOR_ELT &&
19514                 // Constant index.
19515 isa<ConstantSDNode>(Elt2->getOperand(0)->getOperand(1)) &&
19516 isa<ConstantSDNode>(Elt3->getOperand(0)->getOperand(1)) &&
19517 Elt2->getOperand(0)->getOperand(0) ==
19518 Elt3->getOperand(0)->getOperand(0) &&
19519 Elt2->getOperand(0)->getConstantOperandVal(1) == 0 &&
19520 Elt3->getOperand(0)->getConstantOperandVal(1) == 1) {
19521 SDValue HighLanesSrcVec = Elt2->getOperand(0)->getOperand(0);
19522 HighLanes =
19523 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, HighLanesSrcVec);
19524 }
19525 if (HighLanes) {
19526 SDValue DoubleToSingleSticky =
19527 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, LowLanesSrcVec);
19528 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
19529 DoubleToSingleSticky, HighLanes);
19530 return DAG.getNode(ISD::FP_ROUND, DL, VT, Concat,
19531 Elt0->getOperand(1));
19532 }
19533 }
19534 }
19535 }
19536
19537 if (VT == MVT::v2f64) {
19538 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
19539 if (Elt0->getOpcode() == ISD::FP_EXTEND &&
19540 Elt1->getOpcode() == ISD::FP_EXTEND &&
19541        Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19542        Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19543 Elt0->getOperand(0)->getOperand(0) ==
19544 Elt1->getOperand(0)->getOperand(0) &&
19545 // Constant index.
19546 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
19547 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
19548 Elt0->getOperand(0)->getConstantOperandVal(1) + 1 ==
19549 Elt1->getOperand(0)->getConstantOperandVal(1) &&
19550 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
19551 // ResultType's known minimum vector length.
19552 Elt0->getOperand(0)->getConstantOperandVal(1) %
19554 0) {
19555 SDValue SrcVec = Elt0->getOperand(0)->getOperand(0);
19556 if (SrcVec.getValueType() == MVT::v4f16 ||
19557 SrcVec.getValueType() == MVT::v4bf16) {
19558 SDValue HalfToSingle =
19559 DAG.getNode(ISD::FP_EXTEND, DL, MVT::v4f32, SrcVec);
19560 SDValue SubvectorIdx = Elt0->getOperand(0)->getOperand(1);
19561 SDValue Extract = DAG.getNode(
19563 HalfToSingle, SubvectorIdx);
19564 return DAG.getNode(ISD::FP_EXTEND, DL, VT, Extract);
19565 }
19566 }
19567 }
19568
19569 // A build vector of two extracted elements is equivalent to an
19570 // extract subvector where the inner vector is any-extended to the
19571 // extract_vector_elt VT.
19572 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
19573 // (extract_elt_iXX_to_i32 vec Idx+1))
19574 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
19575
19576 // For now, only consider the v2i32 case, which arises as a result of
19577 // legalization.
19578 if (VT != MVT::v2i32)
19579 return SDValue();
19580
19581 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
19582 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
19583 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19584 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19585 // Constant index.
19586 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
19587 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
19588 // Both EXTRACT_VECTOR_ELT from same vector...
19589 Elt0->getOperand(0) == Elt1->getOperand(0) &&
19590 // ... and contiguous. First element's index +1 == second element's index.
19591 Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
19592 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
19593 // ResultType's known minimum vector length.
19594 Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
19595 SDValue VecToExtend = Elt0->getOperand(0);
19596 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
19597 if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
19598 return SDValue();
19599
19600 SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
19601
19602 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
19603 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
19604 SubvectorIdx);
19605 }
19606
19607 return SDValue();
19608}
19609
19610static SDValue performTruncateCombine(SDNode *N,
19611                                      SelectionDAG &DAG) {
19612 EVT VT = N->getValueType(0);
19613 SDValue N0 = N->getOperand(0);
19614 if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
19615 N0.getOpcode() == AArch64ISD::DUP) {
19616 SDValue Op = N0.getOperand(0);
19617 if (VT.getScalarType() == MVT::i32 &&
19618 N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
19619 Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i32, Op);
19620 return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Op);
19621 }
19622
19623 return SDValue();
19624}
19625
19626// Check whether a node is an extend or shift operand.
19627static bool isExtendOrShiftOperand(SDValue N) {
19628  unsigned Opcode = N.getOpcode();
19629 if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
19630 EVT SrcVT;
19631 if (Opcode == ISD::SIGN_EXTEND_INREG)
19632 SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
19633 else
19634 SrcVT = N.getOperand(0).getValueType();
19635
19636 return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
19637 } else if (Opcode == ISD::AND) {
19638 ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
19639 if (!CSD)
19640 return false;
19641 uint64_t AndMask = CSD->getZExtValue();
19642 return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
19643 } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
19644 return isa<ConstantSDNode>(N.getOperand(1));
19645 }
19646
19647 return false;
19648}
19649
19650// (N - Y) + Z --> (Z - Y) + N
19651// when N is an extend or shift operand
19652static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z,
19653                                         SelectionDAG &DAG) {
19654 auto IsOneUseExtend = [](SDValue N) {
19655 return N.hasOneUse() && isExtendOrShiftOperand(N);
19656 };
19657
19658  // DAGCombiner will revert the combination when Z is constant, causing an
19659  // infinite loop, so don't enable the combination when Z is constant.
19660  // If Z is a one-use extend or shift, we also can't do the optimization,
19661  // as it would likewise fall into an infinite loop.
19662 if (isa<ConstantSDNode>(Z) || IsOneUseExtend(Z))
19663 return SDValue();
19664
19665 if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
19666 return SDValue();
19667
19668 SDValue Shift = SUB.getOperand(0);
19669 if (!IsOneUseExtend(Shift))
19670 return SDValue();
19671
19672 SDLoc DL(N);
19673 EVT VT = N->getValueType(0);
19674
19675 SDValue Y = SUB.getOperand(1);
19676 SDValue NewSub = DAG.getNode(ISD::SUB, DL, VT, Z, Y);
19677 return DAG.getNode(ISD::ADD, DL, VT, NewSub, Shift);
19678}
19679
19680static SDValue performAddCombineForShiftedOperands(SDNode *N,
19681                                                   SelectionDAG &DAG) {
19682 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
19683 // commutative.
19684 if (N->getOpcode() != ISD::ADD)
19685 return SDValue();
19686
19687 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
19688 // shifted register is only available for i32 and i64.
19689 EVT VT = N->getValueType(0);
19690 if (VT != MVT::i32 && VT != MVT::i64)
19691 return SDValue();
19692
19693 SDLoc DL(N);
19694 SDValue LHS = N->getOperand(0);
19695 SDValue RHS = N->getOperand(1);
19696
19697 if (SDValue Val = performAddCombineSubShift(N, LHS, RHS, DAG))
19698 return Val;
19699 if (SDValue Val = performAddCombineSubShift(N, RHS, LHS, DAG))
19700 return Val;
19701
19702 uint64_t LHSImm = 0, RHSImm = 0;
19703  // If both operands are shifted by an immediate and the shift amount is not
19704  // greater than 4 for one operand, swap LHS and RHS to put the operand with
19705  // the smaller shift amount on the RHS.
19706 //
19707 // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
19708 // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
19709  // with LSL (shift > 4). On the remaining processors, this transform is a
19710  // no-op for both performance and correctness.
19711 if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
19712 isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
19713 RHSImm > 4 && LHS.hasOneUse())
19714 return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
19715
19716 return SDValue();
19717}
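// Illustrative sketch (not from the original source): for
//
//   long f(long a, long b) { return (a << 2) + (b << 6); }
//
// the swap above makes the small shift (lsl #2) the one folded into the ADD's
// shifted-register form, leaving the larger shift to a separate lsl, which is
// cheaper on the listed cores. The exact benefit per subtarget is an
// assumption.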
19718
19719// The mid end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2))
19720// This reassociates it back to allow the creation of more mls instructions.
19721static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG) {
19722  if (N->getOpcode() != ISD::SUB)
19723 return SDValue();
19724
19725 SDValue Add = N->getOperand(1);
19726 SDValue X = N->getOperand(0);
19727 if (Add.getOpcode() != ISD::ADD)
19728 return SDValue();
19729
19730 if (!Add.hasOneUse())
19731 return SDValue();
19732  if (DAG.isConstantIntBuildVectorOrConstantInt(X))
19733    return SDValue();
19734
19735 SDValue M1 = Add.getOperand(0);
19736 SDValue M2 = Add.getOperand(1);
19737 if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
19738 M1.getOpcode() != AArch64ISD::UMULL)
19739 return SDValue();
19740 if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
19741      M2.getOpcode() != AArch64ISD::UMULL)
19742    return SDValue();
19743
19744 EVT VT = N->getValueType(0);
19745 SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, X, M1);
19746 return DAG.getNode(ISD::SUB, SDLoc(N), VT, Sub, M2);
19747}
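// Illustrative sketch (not from the original source): the reassociation above
// recovers two mls instructions from source such as
//
//   int32x4_t f(int32x4_t x, int32x4_t a, int32x4_t b, int32x4_t c, int32x4_t d) {
//     return x - a * b - c * d;   // reassociated to x - (a*b + c*d) upstream
//   }
//
// by turning sub(x, add(m1, m2)) back into sub(sub(x, m1), m2). The
// vector-extension spelling here is an assumption used for illustration.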
19748
19749// Combine into mla/mls.
19750// This works on the patterns of:
19751// add v1, (mul v2, v3)
19752// sub v1, (mul v2, v3)
19753// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
19754// It will transform the add/sub to a scalable version, so that we can
19755// make use of SVE's MLA/MLS that will be generated for that pattern
19756static SDValue
19757performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
19758  SelectionDAG &DAG = DCI.DAG;
19759 // Make sure that the types are legal
19760 if (!DCI.isAfterLegalizeDAG())
19761 return SDValue();
19762 // Before using SVE's features, check first if it's available.
19763 if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
19764 return SDValue();
19765
19766 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
19767 return SDValue();
19768
19769 if (!N->getValueType(0).isFixedLengthVector())
19770 return SDValue();
19771
19772 auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
19773 if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
19774 return SDValue();
19775
19776 if (!cast<ConstantSDNode>(Op1->getOperand(1))->isZero())
19777 return SDValue();
19778
19779 SDValue MulValue = Op1->getOperand(0);
19780 if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
19781 return SDValue();
19782
19783 if (!Op1.hasOneUse() || !MulValue.hasOneUse())
19784 return SDValue();
19785
19786 EVT ScalableVT = MulValue.getValueType();
19787 if (!ScalableVT.isScalableVector())
19788 return SDValue();
19789
19790 SDValue ScaledOp = convertToScalableVector(DAG, ScalableVT, Op0);
19791 SDValue NewValue =
19792 DAG.getNode(N->getOpcode(), SDLoc(N), ScalableVT, {ScaledOp, MulValue});
19793 return convertFromScalableVector(DAG, N->getValueType(0), NewValue);
19794 };
19795
19796 if (SDValue res = performOpt(N->getOperand(0), N->getOperand(1)))
19797 return res;
19798 else if (N->getOpcode() == ISD::ADD)
19799 return performOpt(N->getOperand(1), N->getOperand(0));
19800
19801 return SDValue();
19802}
19803
19804// Given an i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
19805// help, for example, to produce ssra from sshr+add.
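// For example (illustrative): add (i64 extract_elt (v1i64 X), 0), (i64 load P)
// becomes extract_elt (add (v1i64 X), (scalar_to_vector (load P))), 0, so that
// patterns such as ssra (sshr followed by add) can match on the vector form.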
19806static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) {
19807 EVT VT = N->getValueType(0);
19808 if (VT != MVT::i64)
19809 return SDValue();
19810 SDValue Op0 = N->getOperand(0);
19811 SDValue Op1 = N->getOperand(1);
19812
19813 // At least one of the operands should be an extract, and the other should be
19814 // something that is easy to convert to v1i64 type (in this case a load).
19815 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
19816 Op0.getOpcode() != ISD::LOAD)
19817 return SDValue();
19818 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
19819 Op1.getOpcode() != ISD::LOAD)
19820 return SDValue();
19821
19822 SDLoc DL(N);
19823 if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19824 Op0.getOperand(0).getValueType() == MVT::v1i64) {
19825 Op0 = Op0.getOperand(0);
19826 Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
19827 } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19828 Op1.getOperand(0).getValueType() == MVT::v1i64) {
19829 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
19830 Op1 = Op1.getOperand(0);
19831 } else
19832 return SDValue();
19833
19834 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
19835 DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
19836 DAG.getConstant(0, DL, MVT::i64));
19837}
19838
19839static bool isLoadOrMultipleLoads(SDValue B, SmallVector<LoadSDNode *> &Loads) {
19840 SDValue BV = peekThroughOneUseBitcasts(B);
19841 if (!BV->hasOneUse())
19842 return false;
19843 if (auto *Ld = dyn_cast<LoadSDNode>(BV)) {
19844 if (!Ld || !Ld->isSimple())
19845 return false;
19846 Loads.push_back(Ld);
19847 return true;
19848 } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
19850 for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
19851 auto *Ld = dyn_cast<LoadSDNode>(BV.getOperand(Op));
19852 if (!Ld || !Ld->isSimple() || !BV.getOperand(Op).hasOneUse())
19853 return false;
19854 Loads.push_back(Ld);
19855 }
19856 return true;
19857 } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
19858 // Try to find a tree of shuffles and concats from how IR shuffles of loads
19859 // are lowered. Note that this only comes up because we do not always visit
19860 // operands before uses. After that is fixed this can be removed and in the
19861 // meantime this is fairly specific to the lowering we expect from IR.
19862 // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
19863 // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
19864 // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
19865 // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
19866 // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
19867 // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
19868 // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
19869 // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
19870 // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
19871 if (B.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE ||
19872 B.getOperand(0).getOperand(0).getOpcode() != ISD::CONCAT_VECTORS ||
19873 B.getOperand(0).getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
19874 B.getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
19875 B.getOperand(1).getNumOperands() != 4)
19876 return false;
19877 auto SV1 = cast<ShuffleVectorSDNode>(B);
19878 auto SV2 = cast<ShuffleVectorSDNode>(B.getOperand(0));
19879 int NumElts = B.getValueType().getVectorNumElements();
19880 int NumSubElts = NumElts / 4;
19881 for (int I = 0; I < NumSubElts; I++) {
19882 // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
19883 if (SV1->getMaskElt(I) != I ||
19884 SV1->getMaskElt(I + NumSubElts) != I + NumSubElts ||
19885 SV1->getMaskElt(I + NumSubElts * 2) != I + NumSubElts * 2 ||
19886 SV1->getMaskElt(I + NumSubElts * 3) != I + NumElts)
19887 return false;
19888 // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
19889 if (SV2->getMaskElt(I) != I ||
19890 SV2->getMaskElt(I + NumSubElts) != I + NumSubElts ||
19891 SV2->getMaskElt(I + NumSubElts * 2) != I + NumElts)
19892 return false;
19893 }
19894 auto *Ld0 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(0));
19895 auto *Ld1 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(1));
19896 auto *Ld2 = dyn_cast<LoadSDNode>(SV2->getOperand(1).getOperand(0));
19897 auto *Ld3 = dyn_cast<LoadSDNode>(B.getOperand(1).getOperand(0));
19898 if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
19899 !Ld2->isSimple() || !Ld3->isSimple())
19900 return false;
19901 Loads.push_back(Ld0);
19902 Loads.push_back(Ld1);
19903 Loads.push_back(Ld2);
19904 Loads.push_back(Ld3);
19905 return true;
19906 }
19907 return false;
19908}
19909
19910static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1,
19911 SelectionDAG &DAG,
19912 unsigned &NumSubLoads) {
19913 if (!Op0.hasOneUse() || !Op1.hasOneUse())
19914 return false;
19915
19916 SmallVector<LoadSDNode *> Loads0, Loads1;
19917 if (isLoadOrMultipleLoads(Op0, Loads0) &&
19918 isLoadOrMultipleLoads(Op1, Loads1)) {
19919 if (NumSubLoads && Loads0.size() != NumSubLoads)
19920 return false;
19921 NumSubLoads = Loads0.size();
19922 return Loads0.size() == Loads1.size() &&
19923 all_of(zip(Loads0, Loads1), [&DAG](auto L) {
19924 unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
19925 return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
19926 DAG.areNonVolatileConsecutiveLoads(get<1>(L), get<0>(L),
19927 Size / 8, 1);
19928 });
19929 }
19930
19931 if (Op0.getOpcode() != Op1.getOpcode())
19932 return false;
19933
19934 switch (Op0.getOpcode()) {
19935 case ISD::ADD:
19936 case ISD::SUB:
19937 return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
19938 DAG, NumSubLoads) &&
19939 areLoadedOffsetButOtherwiseSame(Op0.getOperand(1), Op1.getOperand(1),
19940 DAG, NumSubLoads);
19941 case ISD::SIGN_EXTEND:
19942 case ISD::ANY_EXTEND:
19943 case ISD::ZERO_EXTEND:
19944 EVT XVT = Op0.getOperand(0).getValueType();
19945 if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
19946 XVT.getScalarSizeInBits() != 32)
19947 return false;
19948 return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
19949 DAG, NumSubLoads);
19950 }
19951 return false;
19952}
19953
19954// This method attempts to fold trees of add(ext(load p), shl(ext(load p+4)))
19955// into a single load of twice the size, from which we extract the bottom and
19956// top parts so that the shl can use a shll2 instruction. The two loads in that
19957// example can also be larger trees of instructions, which are identical except
19958// for the leaves, which are all loads offset from the LHS, including
19959// buildvectors of multiple loads. For example the RHS tree could be
19960// sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4))).
19961// Whilst it can be common for the larger loads to replace LDP instructions
19962// (which doesn't gain anything on its own), the larger loads can help create
19963// more efficient code, and in buildvectors they prevent the need for ld1 lane
19964// inserts, which can be slower than normal loads.
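// A rough illustration (types chosen only for exposition):
//   add (zext (v8i8 load p) to v8i16), (shl (zext (v8i8 load p+8) to v8i16), splat(8))
// can use one v16i8 load of [p, p+16): the low half feeds the plain extend and
// the high half feeds the shifted extend (ushll2), removing the second load.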
19965static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) {
19966 EVT VT = N->getValueType(0);
19967 if (!VT.isFixedLengthVector() ||
19968 (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
19969 VT.getScalarSizeInBits() != 64))
19970 return SDValue();
19971
19972 SDValue Other = N->getOperand(0);
19973 SDValue Shift = N->getOperand(1);
19974 if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
19975 std::swap(Shift, Other);
19976 APInt ShiftAmt;
19977 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
19978 !ISD::isConstantSplatVector(Shift.getOperand(1).getNode(), ShiftAmt))
19979 return SDValue();
19980
19981 if (!ISD::isExtOpcode(Shift.getOperand(0).getOpcode()) ||
19982 !ISD::isExtOpcode(Other.getOpcode()) ||
19983 Shift.getOperand(0).getOperand(0).getValueType() !=
19984 Other.getOperand(0).getValueType() ||
19985 !Other.hasOneUse() || !Shift.getOperand(0).hasOneUse())
19986 return SDValue();
19987
19988 SDValue Op0 = Other.getOperand(0);
19989 SDValue Op1 = Shift.getOperand(0).getOperand(0);
19990
19991 unsigned NumSubLoads = 0;
19992 if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
19993 return SDValue();
19994
19995 // Attempt to rule out some unprofitable cases using heuristics (some working
19996 // around suboptimal code generation), notably if the extend would not be able
19997 // to use ushll2 instructions because the types are not large enough. Otherwise
19998 // zips will need to be created, which can increase the instruction count.
19999 unsigned NumElts = Op0.getValueType().getVectorNumElements();
20000 unsigned NumSubElts = NumElts / NumSubLoads;
20001 if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
20002 (Other.getOpcode() != Shift.getOperand(0).getOpcode() &&
20003 Op0.getValueType().getSizeInBits() < 128 &&
20005 return SDValue();
20006
20007 // Recreate the tree with the new combined loads.
20008 std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
20009 [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
20010 EVT DVT =
20011 Op0.getValueType().getDoubleNumVectorElementsVT(*DAG.getContext());
20012
20013 SmallVector<LoadSDNode *> Loads0, Loads1;
20014 if (isLoadOrMultipleLoads(Op0, Loads0) &&
20015 isLoadOrMultipleLoads(Op1, Loads1)) {
20016 EVT LoadVT = EVT::getVectorVT(
20017 *DAG.getContext(), Op0.getValueType().getScalarType(),
20018 Op0.getValueType().getVectorNumElements() / Loads0.size());
20019 EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(*DAG.getContext());
20020
20021 SmallVector<SDValue> NewLoads;
20022 for (const auto &[L0, L1] : zip(Loads0, Loads1)) {
20023 SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(),
20024 L0->getBasePtr(), L0->getPointerInfo(),
20025 L0->getOriginalAlign());
20026 DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1));
20027 DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1));
20028 NewLoads.push_back(Load);
20029 }
20030 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op0), DVT, NewLoads);
20031 }
20032
20033 SmallVector<SDValue> Ops;
20034 for (const auto &[O0, O1] : zip(Op0->op_values(), Op1->op_values()))
20035 Ops.push_back(GenCombinedTree(O0, O1, DAG));
20036 return DAG.getNode(Op0.getOpcode(), SDLoc(Op0), DVT, Ops);
20037 };
20038 SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
20039
20040 SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
20041 int Hi = NumSubElts, Lo = 0;
20042 for (unsigned i = 0; i < NumSubLoads; i++) {
20043 for (unsigned j = 0; j < NumSubElts; j++) {
20044 LowMask[i * NumSubElts + j] = Lo++;
20045 HighMask[i * NumSubElts + j] = Hi++;
20046 }
20047 Lo += NumSubElts;
20048 Hi += NumSubElts;
20049 }
20050 SDLoc DL(N);
20051 SDValue Ext0, Ext1;
20052 // Extract the top and bottom lanes, then extend the result. Alternatively,
20053 // if the two extend opcodes match, extend the result first and then extract
20054 // the lanes, as that produces slightly smaller code.
20055 if (Other.getOpcode() != Shift.getOperand(0).getOpcode()) {
20056 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(),
20057 NewOp, DAG.getConstant(0, DL, MVT::i64));
20058 SDValue SubH =
20059 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp,
20060 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
20061 SDValue Extr0 =
20062 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, LowMask);
20063 SDValue Extr1 =
20064 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, HighMask);
20065 Ext0 = DAG.getNode(Other.getOpcode(), DL, VT, Extr0);
20066 Ext1 = DAG.getNode(Shift.getOperand(0).getOpcode(), DL, VT, Extr1);
20067 } else {
20068 EVT DVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
20069 SDValue Ext = DAG.getNode(Other.getOpcode(), DL, DVT, NewOp);
20070 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
20071 DAG.getConstant(0, DL, MVT::i64));
20072 SDValue SubH =
20073 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
20074 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
20075 Ext0 = DAG.getVectorShuffle(VT, DL, SubL, SubH, LowMask);
20076 Ext1 = DAG.getVectorShuffle(VT, DL, SubL, SubH, HighMask);
20077 }
20078 SDValue NShift =
20079 DAG.getNode(Shift.getOpcode(), DL, VT, Ext1, Shift.getOperand(1));
20080 return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift);
20081}
20082
20085 // Try to change sum of two reductions.
20086 if (SDValue Val = performAddUADDVCombine(N, DCI.DAG))
20087 return Val;
20088 if (SDValue Val = performAddDotCombine(N, DCI.DAG))
20089 return Val;
20090 if (SDValue Val = performAddCSelIntoCSinc(N, DCI.DAG))
20091 return Val;
20092 if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
20093 return Val;
20095 return Val;
20097 return Val;
20098 if (SDValue Val = performSubAddMULCombine(N, DCI.DAG))
20099 return Val;
20100 if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
20101 return Val;
20102 if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG))
20103 return Val;
20104
20105 if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
20106 return Val;
20107
20108 return performAddSubLongCombine(N, DCI);
20109}
20110
20111// Massage DAGs which we can use the high-half "long" operations on into
20112// something isel will recognize better. E.g.
20113//
20114// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
20115// (aarch64_neon_umull (extract_high (v2i64 vec)))
20116// (extract_high (v2i64 (dup128 scalar)))))
20117//
20118static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
20119 TargetLowering::DAGCombinerInfo &DCI,
20120 SelectionDAG &DAG) {
20121 if (DCI.isBeforeLegalizeOps())
20122 return SDValue();
20123
20124 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
20125 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
20126 assert(LHS.getValueType().is64BitVector() &&
20127 RHS.getValueType().is64BitVector() &&
20128 "unexpected shape for long operation");
20129
20130 // Either node could be a DUP, but it's not worth doing both of them (you'd
20131 // just as well use the non-high version) so look for a corresponding extract
20132 // operation on the other "wing".
20135 if (!RHS.getNode())
20136 return SDValue();
20139 if (!LHS.getNode())
20140 return SDValue();
20141 } else
20142 return SDValue();
20143
20144 if (IID == Intrinsic::not_intrinsic)
20145 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
20146
20147 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
20148 N->getOperand(0), LHS, RHS);
20149}
20150
20151static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
20152 MVT ElemTy = N->getSimpleValueType(0).getScalarType();
20153 unsigned ElemBits = ElemTy.getSizeInBits();
20154
20155 int64_t ShiftAmount;
20156 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
20157 APInt SplatValue, SplatUndef;
20158 unsigned SplatBitSize;
20159 bool HasAnyUndefs;
20160 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
20161 HasAnyUndefs, ElemBits) ||
20162 SplatBitSize != ElemBits)
20163 return SDValue();
20164
20165 ShiftAmount = SplatValue.getSExtValue();
20166 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
20167 ShiftAmount = CVN->getSExtValue();
20168 } else
20169 return SDValue();
20170
20171 // If the shift amount is zero, remove the shift intrinsic.
20172 if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
20173 return N->getOperand(1);
20174
20175 unsigned Opcode;
20176 bool IsRightShift;
20177 switch (IID) {
20178 default:
20179 llvm_unreachable("Unknown shift intrinsic");
20180 case Intrinsic::aarch64_neon_sqshl:
20181 Opcode = AArch64ISD::SQSHL_I;
20182 IsRightShift = false;
20183 break;
20184 case Intrinsic::aarch64_neon_uqshl:
20185 Opcode = AArch64ISD::UQSHL_I;
20186 IsRightShift = false;
20187 break;
20188 case Intrinsic::aarch64_neon_srshl:
20189 Opcode = AArch64ISD::SRSHR_I;
20190 IsRightShift = true;
20191 break;
20192 case Intrinsic::aarch64_neon_urshl:
20193 Opcode = AArch64ISD::URSHR_I;
20194 IsRightShift = true;
20195 break;
20196 case Intrinsic::aarch64_neon_sqshlu:
20197 Opcode = AArch64ISD::SQSHLU_I;
20198 IsRightShift = false;
20199 break;
20200 case Intrinsic::aarch64_neon_sshl:
20201 case Intrinsic::aarch64_neon_ushl:
20202 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
20203 // left shift for positive shift amounts. For negative shifts we can use a
20204 // VASHR/VLSHR as appropriate.
20205 if (ShiftAmount < 0) {
20206 Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
20207 : AArch64ISD::VLSHR;
20208 ShiftAmount = -ShiftAmount;
20209 } else
20210 Opcode = AArch64ISD::VSHL;
20211 IsRightShift = false;
20212 break;
20213 }
20214
20215 EVT VT = N->getValueType(0);
20216 SDValue Op = N->getOperand(1);
20217 SDLoc dl(N);
20218 if (VT == MVT::i64) {
20219 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op);
20220 VT = MVT::v1i64;
20221 }
20222
20223 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
20224 Op = DAG.getNode(Opcode, dl, VT, Op,
20225 DAG.getConstant(-ShiftAmount, dl, MVT::i32));
20226 if (N->getValueType(0) == MVT::i64)
20227 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
20228 DAG.getConstant(0, dl, MVT::i64));
20229 return Op;
20230 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
20231 Op = DAG.getNode(Opcode, dl, VT, Op,
20232 DAG.getConstant(ShiftAmount, dl, MVT::i32));
20233 if (N->getValueType(0) == MVT::i64)
20234 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
20235 DAG.getConstant(0, dl, MVT::i64));
20236 return Op;
20237 }
20238
20239 return SDValue();
20240}
20241
20242// The CRC32[BH] instructions ignore the high bits of their data operand. Since
20243// the intrinsics must be legal and take an i32, this means there's almost
20244// certainly going to be a zext in the DAG which we can eliminate.
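// For example (illustrative): crc32b(crc, zext i8 x) typically appears in the
// DAG as crc32b(crc, and(x, 0xff)); since CRC32B only reads the low 8 bits of
// its data operand, the AND can be dropped.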
20245static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
20246 SDValue AndN = N->getOperand(2);
20247 if (AndN.getOpcode() != ISD::AND)
20248 return SDValue();
20249
20250 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
20251 if (!CMask || CMask->getZExtValue() != Mask)
20252 return SDValue();
20253
20254 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
20255 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
20256}
20257
20259 SelectionDAG &DAG) {
20260 SDLoc dl(N);
20261 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
20262 DAG.getNode(Opc, dl,
20263 N->getOperand(1).getSimpleValueType(),
20264 N->getOperand(1)),
20265 DAG.getConstant(0, dl, MVT::i64));
20266}
20267
20268static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
20269 SDLoc DL(N);
20270 SDValue Op1 = N->getOperand(1);
20271 SDValue Op2 = N->getOperand(2);
20272 EVT ScalarTy = Op2.getValueType();
20273 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
20274 ScalarTy = MVT::i32;
20275
20276 // Lower index_vector(base, step) to mul(step, step_vector(1)) + splat(base).
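 // For example (illustrative): index(2, 3) yields <2, 5, 8, 11, ...>, i.e.
 // step_vector(1) * splat(3) + splat(2).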
20277 SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
20278 SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
20279 SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
20280 SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
20281 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
20282}
20283
20284static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
20285 SDLoc dl(N);
20286 SDValue Scalar = N->getOperand(3);
20287 EVT ScalarTy = Scalar.getValueType();
20288
20289 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
20290 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
20291
20292 SDValue Passthru = N->getOperand(1);
20293 SDValue Pred = N->getOperand(2);
20294 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
20295 Pred, Scalar, Passthru);
20296}
20297
20298static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
20299 SDLoc dl(N);
20300 LLVMContext &Ctx = *DAG.getContext();
20301 EVT VT = N->getValueType(0);
20302
20303 assert(VT.isScalableVector() && "Expected a scalable vector.");
20304
20305 // Current lowering only supports the SVE-ACLE types.
20307 return SDValue();
20308
20309 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
20310 unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
20311 EVT ByteVT =
20312 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
20313
20314 // Convert everything to the domain of EXT (i.e. bytes).
20315 SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
20316 SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
20317 SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
20318 DAG.getConstant(ElemSize, dl, MVT::i32));
20319
20320 SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
20321 return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
20322}
20323
20324static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
20325 TargetLowering::DAGCombinerInfo &DCI,
20326 SelectionDAG &DAG) {
20327 if (DCI.isBeforeLegalize())
20328 return SDValue();
20329
20330 SDValue Comparator = N->getOperand(3);
20331 if (Comparator.getOpcode() == AArch64ISD::DUP ||
20332 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
20333 unsigned IID = getIntrinsicID(N);
20334 EVT VT = N->getValueType(0);
20335 EVT CmpVT = N->getOperand(2).getValueType();
20336 SDValue Pred = N->getOperand(1);
20337 SDValue Imm;
20338 SDLoc DL(N);
20339
20340 switch (IID) {
20341 default:
20342 llvm_unreachable("Called with wrong intrinsic!");
20343 break;
20344
20345 // Signed comparisons
20346 case Intrinsic::aarch64_sve_cmpeq_wide:
20347 case Intrinsic::aarch64_sve_cmpne_wide:
20348 case Intrinsic::aarch64_sve_cmpge_wide:
20349 case Intrinsic::aarch64_sve_cmpgt_wide:
20350 case Intrinsic::aarch64_sve_cmplt_wide:
20351 case Intrinsic::aarch64_sve_cmple_wide: {
20352 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
20353 int64_t ImmVal = CN->getSExtValue();
20354 if (ImmVal >= -16 && ImmVal <= 15)
20355 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
20356 else
20357 return SDValue();
20358 }
20359 break;
20360 }
20361 // Unsigned comparisons
20362 case Intrinsic::aarch64_sve_cmphs_wide:
20363 case Intrinsic::aarch64_sve_cmphi_wide:
20364 case Intrinsic::aarch64_sve_cmplo_wide:
20365 case Intrinsic::aarch64_sve_cmpls_wide: {
20366 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
20367 uint64_t ImmVal = CN->getZExtValue();
20368 if (ImmVal <= 127)
20369 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
20370 else
20371 return SDValue();
20372 }
20373 break;
20374 }
20375 }
20376
20377 if (!Imm)
20378 return SDValue();
20379
20380 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
20381 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
20382 N->getOperand(2), Splat, DAG.getCondCode(CC));
20383 }
20384
20385 return SDValue();
20386}
20387
20388static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
20389 AArch64CC::CondCode Cond) {
20390 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20391
20392 SDLoc DL(Op);
20393 assert(Op.getValueType().isScalableVector() &&
20394 TLI.isTypeLegal(Op.getValueType()) &&
20395 "Expected legal scalable vector type!");
20396 assert(Op.getValueType() == Pg.getValueType() &&
20397 "Expected same type for PTEST operands");
20398
20399 // Ensure target specific opcodes are using legal type.
20400 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
20401 SDValue TVal = DAG.getConstant(1, DL, OutVT);
20402 SDValue FVal = DAG.getConstant(0, DL, OutVT);
20403
20404 // Ensure operands have type nxv16i1.
20405 if (Op.getValueType() != MVT::nxv16i1) {
20408 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
20409 else
20410 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
20411 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
20412 }
20413
20414 // Set condition code (CC) flags.
20415 SDValue Test = DAG.getNode(
20417 DL, MVT::Other, Pg, Op);
20418
20419 // Convert CC to integer based on requested condition.
20420 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
20421 SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
20422 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
20423 return DAG.getZExtOrTrunc(Res, DL, VT);
20424}
20425
20427 SelectionDAG &DAG) {
20428 SDLoc DL(N);
20429
20430 SDValue Pred = N->getOperand(1);
20431 SDValue VecToReduce = N->getOperand(2);
20432
20433 // NOTE: The integer reduction's result type is not always linked to the
20434 // operand's element type so we construct it from the intrinsic's result type.
20435 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
20436 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
20437
20438 // SVE reductions set the whole vector register with the first element
20439 // containing the reduction result, which we'll now extract.
20440 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20441 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
20442 Zero);
20443}
20444
20446 SelectionDAG &DAG) {
20447 SDLoc DL(N);
20448
20449 SDValue Pred = N->getOperand(1);
20450 SDValue VecToReduce = N->getOperand(2);
20451
20452 EVT ReduceVT = VecToReduce.getValueType();
20453 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
20454
20455 // SVE reductions set the whole vector register with the first element
20456 // containing the reduction result, which we'll now extract.
20457 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20458 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
20459 Zero);
20460}
20461
20463 SelectionDAG &DAG) {
20464 SDLoc DL(N);
20465
20466 SDValue Pred = N->getOperand(1);
20467 SDValue InitVal = N->getOperand(2);
20468 SDValue VecToReduce = N->getOperand(3);
20469 EVT ReduceVT = VecToReduce.getValueType();
20470
20471 // Ordered reductions use the first lane of the result vector as the
20472 // reduction's initial value.
20473 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20474 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
20475 DAG.getUNDEF(ReduceVT), InitVal, Zero);
20476
20477 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
20478
20479 // SVE reductions set the whole vector register with the first element
20480 // containing the reduction result, which we'll now extract.
20481 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
20482 Zero);
20483}
20484
20485// If a merged operation has no inactive lanes we can relax it to a predicated
20486// or unpredicated operation, which potentially allows better isel (perhaps
20487// using immediate forms) or relaxing register reuse requirements.
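// For example (illustrative): an SVE merged intrinsic whose governing
// predicate is all-active, such as sve.sqadd(ptrue, a, b), can be emitted as
// the unpredicated ISD::SADDSAT a, b.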
20488static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
20489 SelectionDAG &DAG, bool UnpredOp = false,
20490 bool SwapOperands = false) {
20491 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
20492 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
20493 SDValue Pg = N->getOperand(1);
20494 SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
20495 SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
20496
20497 // ISD way to specify an all active predicate.
20498 if (isAllActivePredicate(DAG, Pg)) {
20499 if (UnpredOp)
20500 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
20501
20502 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
20503 }
20504
20505 // FUTURE: SplatVector(true)
20506 return SDValue();
20507}
20508
20511 const AArch64Subtarget *Subtarget) {
20512 SelectionDAG &DAG = DCI.DAG;
20513 unsigned IID = getIntrinsicID(N);
20514 switch (IID) {
20515 default:
20516 break;
20517 case Intrinsic::get_active_lane_mask: {
20518 SDValue Res = SDValue();
20519 EVT VT = N->getValueType(0);
20520 if (VT.isFixedLengthVector()) {
20521 // We can use the SVE whilelo instruction to lower this intrinsic by
20522 // creating the appropriate sequence of scalable vector operations and
20523 // then extracting a fixed-width subvector from the scalable vector.
20524
20525 SDLoc DL(N);
20526 SDValue ID =
20527 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
20528
20529 EVT WhileVT = EVT::getVectorVT(
20530 *DAG.getContext(), MVT::i1,
20531 ElementCount::getScalable(VT.getVectorNumElements()));
20532
20533 // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32.
20534 EVT PromVT = getPromotedVTForPredicate(WhileVT);
20535
20536 // Get the fixed-width equivalent of PromVT for extraction.
20537 EVT ExtVT =
20538 EVT::getVectorVT(*DAG.getContext(), PromVT.getVectorElementType(),
20539 VT.getVectorElementCount());
20540
20541 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID,
20542 N->getOperand(1), N->getOperand(2));
20543 Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res);
20544 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res,
20545 DAG.getConstant(0, DL, MVT::i64));
20546 Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
20547 }
20548 return Res;
20549 }
20550 case Intrinsic::aarch64_neon_vcvtfxs2fp:
20551 case Intrinsic::aarch64_neon_vcvtfxu2fp:
20552 return tryCombineFixedPointConvert(N, DCI, DAG);
20553 case Intrinsic::aarch64_neon_saddv:
20555 case Intrinsic::aarch64_neon_uaddv:
20557 case Intrinsic::aarch64_neon_sminv:
20559 case Intrinsic::aarch64_neon_uminv:
20561 case Intrinsic::aarch64_neon_smaxv:
20563 case Intrinsic::aarch64_neon_umaxv:
20565 case Intrinsic::aarch64_neon_fmax:
20566 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
20567 N->getOperand(1), N->getOperand(2));
20568 case Intrinsic::aarch64_neon_fmin:
20569 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
20570 N->getOperand(1), N->getOperand(2));
20571 case Intrinsic::aarch64_neon_fmaxnm:
20572 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
20573 N->getOperand(1), N->getOperand(2));
20574 case Intrinsic::aarch64_neon_fminnm:
20575 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
20576 N->getOperand(1), N->getOperand(2));
20577 case Intrinsic::aarch64_neon_smull:
20578 return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
20579 N->getOperand(1), N->getOperand(2));
20580 case Intrinsic::aarch64_neon_umull:
20581 return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
20582 N->getOperand(1), N->getOperand(2));
20583 case Intrinsic::aarch64_neon_pmull:
20584 return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
20585 N->getOperand(1), N->getOperand(2));
20586 case Intrinsic::aarch64_neon_sqdmull:
20587 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
20588 case Intrinsic::aarch64_neon_sqshl:
20589 case Intrinsic::aarch64_neon_uqshl:
20590 case Intrinsic::aarch64_neon_sqshlu:
20591 case Intrinsic::aarch64_neon_srshl:
20592 case Intrinsic::aarch64_neon_urshl:
20593 case Intrinsic::aarch64_neon_sshl:
20594 case Intrinsic::aarch64_neon_ushl:
20595 return tryCombineShiftImm(IID, N, DAG);
20596 case Intrinsic::aarch64_neon_sabd:
20597 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
20598 N->getOperand(1), N->getOperand(2));
20599 case Intrinsic::aarch64_neon_uabd:
20600 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
20601 N->getOperand(1), N->getOperand(2));
20602 case Intrinsic::aarch64_crc32b:
20603 case Intrinsic::aarch64_crc32cb:
20604 return tryCombineCRC32(0xff, N, DAG);
20605 case Intrinsic::aarch64_crc32h:
20606 case Intrinsic::aarch64_crc32ch:
20607 return tryCombineCRC32(0xffff, N, DAG);
20608 case Intrinsic::aarch64_sve_saddv:
20609 // There is no i64 version of SADDV because the sign is irrelevant.
20610 if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
20612 else
20614 case Intrinsic::aarch64_sve_uaddv:
20616 case Intrinsic::aarch64_sve_smaxv:
20618 case Intrinsic::aarch64_sve_umaxv:
20620 case Intrinsic::aarch64_sve_sminv:
20622 case Intrinsic::aarch64_sve_uminv:
20624 case Intrinsic::aarch64_sve_orv:
20626 case Intrinsic::aarch64_sve_eorv:
20628 case Intrinsic::aarch64_sve_andv:
20630 case Intrinsic::aarch64_sve_index:
20631 return LowerSVEIntrinsicIndex(N, DAG);
20632 case Intrinsic::aarch64_sve_dup:
20633 return LowerSVEIntrinsicDUP(N, DAG);
20634 case Intrinsic::aarch64_sve_dup_x:
20635 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
20636 N->getOperand(1));
20637 case Intrinsic::aarch64_sve_ext:
20638 return LowerSVEIntrinsicEXT(N, DAG);
20639 case Intrinsic::aarch64_sve_mul_u:
20640 return DAG.getNode(AArch64ISD::MUL_PRED, SDLoc(N), N->getValueType(0),
20641 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20642 case Intrinsic::aarch64_sve_smulh_u:
20643 return DAG.getNode(AArch64ISD::MULHS_PRED, SDLoc(N), N->getValueType(0),
20644 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20645 case Intrinsic::aarch64_sve_umulh_u:
20646 return DAG.getNode(AArch64ISD::MULHU_PRED, SDLoc(N), N->getValueType(0),
20647 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20648 case Intrinsic::aarch64_sve_smin_u:
20649 return DAG.getNode(AArch64ISD::SMIN_PRED, SDLoc(N), N->getValueType(0),
20650 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20651 case Intrinsic::aarch64_sve_umin_u:
20652 return DAG.getNode(AArch64ISD::UMIN_PRED, SDLoc(N), N->getValueType(0),
20653 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20654 case Intrinsic::aarch64_sve_smax_u:
20655 return DAG.getNode(AArch64ISD::SMAX_PRED, SDLoc(N), N->getValueType(0),
20656 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20657 case Intrinsic::aarch64_sve_umax_u:
20658 return DAG.getNode(AArch64ISD::UMAX_PRED, SDLoc(N), N->getValueType(0),
20659 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20660 case Intrinsic::aarch64_sve_lsl_u:
20661 return DAG.getNode(AArch64ISD::SHL_PRED, SDLoc(N), N->getValueType(0),
20662 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20663 case Intrinsic::aarch64_sve_lsr_u:
20664 return DAG.getNode(AArch64ISD::SRL_PRED, SDLoc(N), N->getValueType(0),
20665 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20666 case Intrinsic::aarch64_sve_asr_u:
20667 return DAG.getNode(AArch64ISD::SRA_PRED, SDLoc(N), N->getValueType(0),
20668 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20669 case Intrinsic::aarch64_sve_fadd_u:
20670 return DAG.getNode(AArch64ISD::FADD_PRED, SDLoc(N), N->getValueType(0),
20671 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20672 case Intrinsic::aarch64_sve_fdiv_u:
20673 return DAG.getNode(AArch64ISD::FDIV_PRED, SDLoc(N), N->getValueType(0),
20674 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20675 case Intrinsic::aarch64_sve_fmax_u:
20676 return DAG.getNode(AArch64ISD::FMAX_PRED, SDLoc(N), N->getValueType(0),
20677 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20678 case Intrinsic::aarch64_sve_fmaxnm_u:
20679 return DAG.getNode(AArch64ISD::FMAXNM_PRED, SDLoc(N), N->getValueType(0),
20680 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20681 case Intrinsic::aarch64_sve_fmla_u:
20682 return DAG.getNode(AArch64ISD::FMA_PRED, SDLoc(N), N->getValueType(0),
20683 N->getOperand(1), N->getOperand(3), N->getOperand(4),
20684 N->getOperand(2));
20685 case Intrinsic::aarch64_sve_fmin_u:
20686 return DAG.getNode(AArch64ISD::FMIN_PRED, SDLoc(N), N->getValueType(0),
20687 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20688 case Intrinsic::aarch64_sve_fminnm_u:
20689 return DAG.getNode(AArch64ISD::FMINNM_PRED, SDLoc(N), N->getValueType(0),
20690 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20691 case Intrinsic::aarch64_sve_fmul_u:
20692 return DAG.getNode(AArch64ISD::FMUL_PRED, SDLoc(N), N->getValueType(0),
20693 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20694 case Intrinsic::aarch64_sve_fsub_u:
20695 return DAG.getNode(AArch64ISD::FSUB_PRED, SDLoc(N), N->getValueType(0),
20696 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20697 case Intrinsic::aarch64_sve_add_u:
20698 return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2),
20699 N->getOperand(3));
20700 case Intrinsic::aarch64_sve_sub_u:
20701 return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2),
20702 N->getOperand(3));
20703 case Intrinsic::aarch64_sve_subr:
20704 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
20705 case Intrinsic::aarch64_sve_and_u:
20706 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), N->getOperand(2),
20707 N->getOperand(3));
20708 case Intrinsic::aarch64_sve_bic_u:
20709 return DAG.getNode(AArch64ISD::BIC, SDLoc(N), N->getValueType(0),
20710 N->getOperand(2), N->getOperand(3));
20711 case Intrinsic::aarch64_sve_eor_u:
20712 return DAG.getNode(ISD::XOR, SDLoc(N), N->getValueType(0), N->getOperand(2),
20713 N->getOperand(3));
20714 case Intrinsic::aarch64_sve_orr_u:
20715 return DAG.getNode(ISD::OR, SDLoc(N), N->getValueType(0), N->getOperand(2),
20716 N->getOperand(3));
20717 case Intrinsic::aarch64_sve_sabd_u:
20718 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
20719 N->getOperand(2), N->getOperand(3));
20720 case Intrinsic::aarch64_sve_uabd_u:
20721 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
20722 N->getOperand(2), N->getOperand(3));
20723 case Intrinsic::aarch64_sve_sdiv_u:
20724 return DAG.getNode(AArch64ISD::SDIV_PRED, SDLoc(N), N->getValueType(0),
20725 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20726 case Intrinsic::aarch64_sve_udiv_u:
20727 return DAG.getNode(AArch64ISD::UDIV_PRED, SDLoc(N), N->getValueType(0),
20728 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20729 case Intrinsic::aarch64_sve_sqadd:
20730 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
20731 case Intrinsic::aarch64_sve_sqsub_u:
20732 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
20733 N->getOperand(2), N->getOperand(3));
20734 case Intrinsic::aarch64_sve_uqadd:
20735 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
20736 case Intrinsic::aarch64_sve_uqsub_u:
20737 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
20738 N->getOperand(2), N->getOperand(3));
20739 case Intrinsic::aarch64_sve_sqadd_x:
20740 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
20741 N->getOperand(1), N->getOperand(2));
20742 case Intrinsic::aarch64_sve_sqsub_x:
20743 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
20744 N->getOperand(1), N->getOperand(2));
20745 case Intrinsic::aarch64_sve_uqadd_x:
20746 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
20747 N->getOperand(1), N->getOperand(2));
20748 case Intrinsic::aarch64_sve_uqsub_x:
20749 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
20750 N->getOperand(1), N->getOperand(2));
20751 case Intrinsic::aarch64_sve_asrd:
20752 return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
20753 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20754 case Intrinsic::aarch64_sve_cmphs:
20755 if (!N->getOperand(2).getValueType().isFloatingPoint())
20756 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20757 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20758 N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
20759 break;
20760 case Intrinsic::aarch64_sve_cmphi:
20761 if (!N->getOperand(2).getValueType().isFloatingPoint())
20762 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20763 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20764 N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
20765 break;
20766 case Intrinsic::aarch64_sve_fcmpge:
20767 case Intrinsic::aarch64_sve_cmpge:
20768 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20769 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20770 N->getOperand(3), DAG.getCondCode(ISD::SETGE));
20771 break;
20772 case Intrinsic::aarch64_sve_fcmpgt:
20773 case Intrinsic::aarch64_sve_cmpgt:
20774 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20775 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20776 N->getOperand(3), DAG.getCondCode(ISD::SETGT));
20777 break;
20778 case Intrinsic::aarch64_sve_fcmpeq:
20779 case Intrinsic::aarch64_sve_cmpeq:
20780 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20781 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20782 N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
20783 break;
20784 case Intrinsic::aarch64_sve_fcmpne:
20785 case Intrinsic::aarch64_sve_cmpne:
20786 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20787 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20788 N->getOperand(3), DAG.getCondCode(ISD::SETNE));
20789 break;
20790 case Intrinsic::aarch64_sve_fcmpuo:
20791 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20792 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20793 N->getOperand(3), DAG.getCondCode(ISD::SETUO));
20794 break;
20795 case Intrinsic::aarch64_sve_fadda:
20797 case Intrinsic::aarch64_sve_faddv:
20799 case Intrinsic::aarch64_sve_fmaxnmv:
20801 case Intrinsic::aarch64_sve_fmaxv:
20803 case Intrinsic::aarch64_sve_fminnmv:
20805 case Intrinsic::aarch64_sve_fminv:
20807 case Intrinsic::aarch64_sve_sel:
20808 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
20809 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20810 case Intrinsic::aarch64_sve_cmpeq_wide:
20811 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
20812 case Intrinsic::aarch64_sve_cmpne_wide:
20813 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
20814 case Intrinsic::aarch64_sve_cmpge_wide:
20815 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
20816 case Intrinsic::aarch64_sve_cmpgt_wide:
20817 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
20818 case Intrinsic::aarch64_sve_cmplt_wide:
20819 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
20820 case Intrinsic::aarch64_sve_cmple_wide:
20821 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
20822 case Intrinsic::aarch64_sve_cmphs_wide:
20823 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
20824 case Intrinsic::aarch64_sve_cmphi_wide:
20825 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
20826 case Intrinsic::aarch64_sve_cmplo_wide:
20827 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
20828 case Intrinsic::aarch64_sve_cmpls_wide:
20829 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
20830 case Intrinsic::aarch64_sve_ptest_any:
20831 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
20833 case Intrinsic::aarch64_sve_ptest_first:
20834 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
20836 case Intrinsic::aarch64_sve_ptest_last:
20837 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
20839 }
20840 return SDValue();
20841}
20842
20843static bool isCheapToExtend(const SDValue &N) {
20844 unsigned OC = N->getOpcode();
20845 return OC == ISD::LOAD || OC == ISD::MLOAD ||
20847}
20848
20849static SDValue
20851 SelectionDAG &DAG) {
20852 // If we have (sext (setcc A B)) and A and B are cheap to extend,
20853 // we can move the sext into the arguments and have the same result. For
20854 // example, if A and B are both loads, we can make those extending loads and
20855 // avoid an extra instruction. This pattern appears often in VLS code
20856 // generation where the inputs to the setcc have a different size to the
20857 // instruction that wants to use the result of the setcc.
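 // For example (illustrative): sext(setcc (v8i8 load A), (v8i8 load B), setlt)
 // to v8i16 can become setcc (sextload A to v8i16), (sextload B to v8i16),
 // setlt, producing the wider result directly.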
20858 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
20859 N->getOperand(0)->getOpcode() == ISD::SETCC);
20860 const SDValue SetCC = N->getOperand(0);
20861
20862 const SDValue CCOp0 = SetCC.getOperand(0);
20863 const SDValue CCOp1 = SetCC.getOperand(1);
20864 if (!CCOp0->getValueType(0).isInteger() ||
20865 !CCOp1->getValueType(0).isInteger())
20866 return SDValue();
20867
20868 ISD::CondCode Code =
20869 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();
20870
20871 ISD::NodeType ExtType =
20872 isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
20873
20874 if (isCheapToExtend(SetCC.getOperand(0)) &&
20875 isCheapToExtend(SetCC.getOperand(1))) {
20876 const SDValue Ext1 =
20877 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
20878 const SDValue Ext2 =
20879 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);
20880
20881 return DAG.getSetCC(
20882 SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
20883 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
20884 }
20885
20886 return SDValue();
20887}
20888
20891 SelectionDAG &DAG) {
20892 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
20893 // we can convert that DUP into another extract_high (of a bigger DUP), which
20894 // helps the backend to decide that an sabdl2 would be useful, saving a real
20895 // extract_high operation.
20896 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
20897 (N->getOperand(0).getOpcode() == ISD::ABDU ||
20898 N->getOperand(0).getOpcode() == ISD::ABDS)) {
20899 SDNode *ABDNode = N->getOperand(0).getNode();
20900 SDValue NewABD =
20902 if (!NewABD.getNode())
20903 return SDValue();
20904
20905 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
20906 }
20907
20908 if (N->getValueType(0).isFixedLengthVector() &&
20909 N->getOpcode() == ISD::SIGN_EXTEND &&
20910 N->getOperand(0)->getOpcode() == ISD::SETCC)
20911 return performSignExtendSetCCCombine(N, DCI, DAG);
20912
20913 return SDValue();
20914}
20915
20917 SDValue SplatVal, unsigned NumVecElts) {
20918 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
20919 Align OrigAlignment = St.getAlign();
20920 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
20921
20922 // Create scalar stores. This is at least as good as the code sequence for a
20923 // split unaligned store which is a dup.s, ext.b, and two stores.
20924 // Most of the time the three stores should be replaced by store pair
20925 // instructions (stp).
20926 SDLoc DL(&St);
20927 SDValue BasePtr = St.getBasePtr();
20928 uint64_t BaseOffset = 0;
20929
20930 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
20931 SDValue NewST1 =
20932 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
20933 OrigAlignment, St.getMemOperand()->getFlags());
20934
20935 // As this is in ISel, we will not merge this add, which may degrade results.
20936 if (BasePtr->getOpcode() == ISD::ADD &&
20937 isa<ConstantSDNode>(BasePtr->getOperand(1))) {
20938 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
20939 BasePtr = BasePtr->getOperand(0);
20940 }
20941
20942 unsigned Offset = EltOffset;
20943 while (--NumVecElts) {
20944 Align Alignment = commonAlignment(OrigAlignment, Offset);
20945 SDValue OffsetPtr =
20946 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
20947 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
20948 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
20949 PtrInfo.getWithOffset(Offset), Alignment,
20950 St.getMemOperand()->getFlags());
20951 Offset += EltOffset;
20952 }
20953 return NewST1;
20954}
20955
20956// Returns an SVE type that ContentTy can be trivially sign or zero extended
20957// into.
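// For example, nxv2i16 and nxv2i32 both widen to the nxv2i64 container, while
// nxv16i8 is already a full container and maps to itself.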
20958static MVT getSVEContainerType(EVT ContentTy) {
20959 assert(ContentTy.isSimple() && "No SVE containers for extended types");
20960
20961 switch (ContentTy.getSimpleVT().SimpleTy) {
20962 default:
20963 llvm_unreachable("No known SVE container for this MVT type");
20964 case MVT::nxv2i8:
20965 case MVT::nxv2i16:
20966 case MVT::nxv2i32:
20967 case MVT::nxv2i64:
20968 case MVT::nxv2f32:
20969 case MVT::nxv2f64:
20970 return MVT::nxv2i64;
20971 case MVT::nxv4i8:
20972 case MVT::nxv4i16:
20973 case MVT::nxv4i32:
20974 case MVT::nxv4f32:
20975 return MVT::nxv4i32;
20976 case MVT::nxv8i8:
20977 case MVT::nxv8i16:
20978 case MVT::nxv8f16:
20979 case MVT::nxv8bf16:
20980 return MVT::nxv8i16;
20981 case MVT::nxv16i8:
20982 return MVT::nxv16i8;
20983 }
20984}
20985
20986static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
20987 SDLoc DL(N);
20988 EVT VT = N->getValueType(0);
20989
20991 return SDValue();
20992
20993 EVT ContainerVT = VT;
20994 if (ContainerVT.isInteger())
20995 ContainerVT = getSVEContainerType(ContainerVT);
20996
20997 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
20998 SDValue Ops[] = { N->getOperand(0), // Chain
20999 N->getOperand(2), // Pg
21000 N->getOperand(3), // Base
21001 DAG.getValueType(VT) };
21002
21003 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
21004 SDValue LoadChain = SDValue(Load.getNode(), 1);
21005
21006 if (ContainerVT.isInteger() && (VT != ContainerVT))
21007 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
21008
21009 return DAG.getMergeValues({ Load, LoadChain }, DL);
21010}
21011
21013 SDLoc DL(N);
21014 EVT VT = N->getValueType(0);
21015 EVT PtrTy = N->getOperand(3).getValueType();
21016
21017 EVT LoadVT = VT;
21018 if (VT.isFloatingPoint())
21019 LoadVT = VT.changeTypeToInteger();
21020
21021 auto *MINode = cast<MemIntrinsicSDNode>(N);
21022 SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
21023 SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
21024 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
21025 MINode->getOperand(2), PassThru,
21026 MINode->getMemoryVT(), MINode->getMemOperand(),
21028
21029 if (VT.isFloatingPoint()) {
21030 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
21031 return DAG.getMergeValues(Ops, DL);
21032 }
21033
21034 return L;
21035}
21036
21037template <unsigned Opcode>
21039 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
21041 "Unsupported opcode.");
21042 SDLoc DL(N);
21043 EVT VT = N->getValueType(0);
21044
21045 EVT LoadVT = VT;
21046 if (VT.isFloatingPoint())
21047 LoadVT = VT.changeTypeToInteger();
21048
21049 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
21050 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
21051 SDValue LoadChain = SDValue(Load.getNode(), 1);
21052
21053 if (VT.isFloatingPoint())
21054 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
21055
21056 return DAG.getMergeValues({Load, LoadChain}, DL);
21057}
21058
21060 SDLoc DL(N);
21061 SDValue Data = N->getOperand(2);
21062 EVT DataVT = Data.getValueType();
21063 EVT HwSrcVt = getSVEContainerType(DataVT);
21064 SDValue InputVT = DAG.getValueType(DataVT);
21065
21066 if (DataVT.isFloatingPoint())
21067 InputVT = DAG.getValueType(HwSrcVt);
21068
21069 SDValue SrcNew;
21070 if (Data.getValueType().isFloatingPoint())
21071 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
21072 else
21073 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
21074
21075 SDValue Ops[] = { N->getOperand(0), // Chain
21076 SrcNew,
21077 N->getOperand(4), // Base
21078 N->getOperand(3), // Pg
21079 InputVT
21080 };
21081
21082 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
21083}
21084
21086 SDLoc DL(N);
21087
21088 SDValue Data = N->getOperand(2);
21089 EVT DataVT = Data.getValueType();
21090 EVT PtrTy = N->getOperand(4).getValueType();
21091
21092 if (DataVT.isFloatingPoint())
21094
21095 auto *MINode = cast<MemIntrinsicSDNode>(N);
21096 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
21097 DAG.getUNDEF(PtrTy), MINode->getOperand(3),
21098 MINode->getMemoryVT(), MINode->getMemOperand(),
21099 ISD::UNINDEXED, false, false);
21100}
21101
21102/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
21103/// load store optimizer pass will merge them to store pair stores. This should
21104/// be better than a movi to create the vector zero followed by a vector store
21105/// if the zero constant is not re-used, since one instruction and one register
21106/// live range will be removed.
21107///
21108/// For example, the final generated code should be:
21109///
21110/// stp xzr, xzr, [x0]
21111///
21112/// instead of:
21113///
21114/// movi v0.2d, #0
21115/// str q0, [x0]
21116///
21117static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
21118 SDValue StVal = St.getValue();
21119 EVT VT = StVal.getValueType();
21120
21121 // Avoid scalarizing zero splat stores for scalable vectors.
21122 if (VT.isScalableVector())
21123 return SDValue();
21124
21125 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
21126 // 2, 3 or 4 i32 elements.
21127 int NumVecElts = VT.getVectorNumElements();
21128 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
21129 VT.getVectorElementType().getSizeInBits() == 64) ||
21130 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
21131 VT.getVectorElementType().getSizeInBits() == 32)))
21132 return SDValue();
21133
21134 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
21135 return SDValue();
21136
21137 // If the zero constant has more than one use then the vector store could be
21138 // better since the constant mov will be amortized and stp q instructions
21139 // should be able to be formed.
21140 if (!StVal.hasOneUse())
21141 return SDValue();
21142
21143 // If the store is truncating then it's going down to i16 or smaller, which
21144 // means it can be implemented in a single store anyway.
21145 if (St.isTruncatingStore())
21146 return SDValue();
21147
21148 // If the immediate offset of the address operand is too large for the stp
21149 // instruction, then bail out.
21150 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
21151 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
21152 if (Offset < -512 || Offset > 504)
21153 return SDValue();
21154 }
21155
21156 for (int I = 0; I < NumVecElts; ++I) {
21157 SDValue EltVal = StVal.getOperand(I);
21158 if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
21159 return SDValue();
21160 }
21161
21162 // Use a CopyFromReg WZR/XZR here to prevent
21163 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
21164 SDLoc DL(&St);
21165 unsigned ZeroReg;
21166 EVT ZeroVT;
21167 if (VT.getVectorElementType().getSizeInBits() == 32) {
21168 ZeroReg = AArch64::WZR;
21169 ZeroVT = MVT::i32;
21170 } else {
21171 ZeroReg = AArch64::XZR;
21172 ZeroVT = MVT::i64;
21173 }
21174 SDValue SplatVal =
21175 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
21176 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
21177}
21178
21179/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
21180/// value. The load store optimizer pass will merge them to store pair stores.
21181/// This has better performance than a splat of the scalar followed by a split
21182/// vector store. Even if the stores are not merged it is four stores vs a dup,
21183/// followed by an ext.b and two stores.
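/// For example (illustrative), a v2i64 store of a splat of X can be emitted as
///   stp X, X, [ptr]
/// once the two scalar stores are paired.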
21184static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
21185 SDValue StVal = St.getValue();
21186 EVT VT = StVal.getValueType();
21187
21188 // Don't replace floating point stores, they possibly won't be transformed to
21189 // stp because of the store pair suppress pass.
21190 if (VT.isFloatingPoint())
21191 return SDValue();
21192
21193 // We can express a splat as store pair(s) for 2 or 4 elements.
21194 unsigned NumVecElts = VT.getVectorNumElements();
21195 if (NumVecElts != 4 && NumVecElts != 2)
21196 return SDValue();
21197
21198 // If the store is truncating then it's going down to i16 or smaller, which
21199 // means it can be implemented in a single store anyway.
21200 if (St.isTruncatingStore())
21201 return SDValue();
21202
21203 // Check that this is a splat.
21204 // Make sure that each of the relevant vector element locations are inserted
21205 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
21206 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
21207 SDValue SplatVal;
21208 for (unsigned I = 0; I < NumVecElts; ++I) {
21209 // Check for insert vector elements.
21210 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
21211 return SDValue();
21212
21213 // Check that same value is inserted at each vector element.
21214 if (I == 0)
21215 SplatVal = StVal.getOperand(1);
21216 else if (StVal.getOperand(1) != SplatVal)
21217 return SDValue();
21218
21219 // Check insert element index.
21220 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
21221 if (!CIndex)
21222 return SDValue();
21223 uint64_t IndexVal = CIndex->getZExtValue();
21224 if (IndexVal >= NumVecElts)
21225 return SDValue();
21226 IndexNotInserted.reset(IndexVal);
21227
21228 StVal = StVal.getOperand(0);
21229 }
21230 // Check that all vector element locations were inserted to.
21231 if (IndexNotInserted.any())
21232 return SDValue();
21233
21234 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
21235}
21236
21238 SelectionDAG &DAG,
21239 const AArch64Subtarget *Subtarget) {
21240
21241 StoreSDNode *S = cast<StoreSDNode>(N);
21242 if (S->isVolatile() || S->isIndexed())
21243 return SDValue();
21244
21245 SDValue StVal = S->getValue();
21246 EVT VT = StVal.getValueType();
21247
21248 if (!VT.isFixedLengthVector())
21249 return SDValue();
21250
21251 // If we get a splat of zeros, convert this vector store to a store of
21252 // scalars. They will be merged into store pairs of xzr thereby removing one
21253 // instruction and one register.
21254 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
21255 return ReplacedZeroSplat;
21256
21257 // FIXME: The logic for deciding if an unaligned store should be split should
21258 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
21259 // a call to that function here.
21260
21261 if (!Subtarget->isMisaligned128StoreSlow())
21262 return SDValue();
21263
21264 // Don't split at -Oz.
21265 if (DAG.getMachineFunction().getFunction().hasMinSize())
21266 return SDValue();
21267
21268 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
21269 // those up regresses performance on micro-benchmarks and olden/bh.
21270 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
21271 return SDValue();
21272
21273 // Split unaligned 16B stores. They are terrible for performance.
21274 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
21275 // extensions can use this to mark that it does not want splitting to happen
21276 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
21277 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
21278 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
21279 S->getAlign() <= Align(2))
21280 return SDValue();
21281
21282 // If we get a splat of a scalar convert this vector store to a store of
21283 // scalars. They will be merged into store pairs thereby removing two
21284 // instructions.
21285 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
21286 return ReplacedSplat;
21287
21288 SDLoc DL(S);
21289
21290 // Split VT into two.
21291 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
21292 unsigned NumElts = HalfVT.getVectorNumElements();
21293 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
21294 DAG.getConstant(0, DL, MVT::i64));
21295 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
21296 DAG.getConstant(NumElts, DL, MVT::i64));
21297 SDValue BasePtr = S->getBasePtr();
21298 SDValue NewST1 =
21299 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
21300 S->getAlign(), S->getMemOperand()->getFlags());
21301 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
21302 DAG.getConstant(8, DL, MVT::i64));
21303 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
21304 S->getPointerInfo(), S->getAlign(),
21305 S->getMemOperand()->getFlags());
21306}
21307
21308static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
21309 assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
21310
21311 // splice(pg, op1, undef) -> op1
21312 if (N->getOperand(2).isUndef())
21313 return N->getOperand(1);
21314
21315 return SDValue();
21316}
21317
21318static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
21319 const AArch64Subtarget *Subtarget) {
21320 assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
21321 N->getOpcode() == AArch64ISD::UUNPKLO) &&
21322 "Unexpected Opcode!");
21323
21324 // uunpklo/hi undef -> undef
21325 if (N->getOperand(0).isUndef())
21326 return DAG.getUNDEF(N->getValueType(0));
21327
21328 // If this is a masked load followed by an UUNPKLO, fold this into a masked
21329 // extending load. We can do this even if this is already a masked
21330 // {z,}extload.
21331 if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
21332 N->getOpcode() == AArch64ISD::UUNPKLO) {
21333 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
21334 SDValue Mask = MLD->getMask();
21335 SDLoc DL(N);
21336
21337 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
21338 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
21339 (MLD->getPassThru()->isUndef() ||
21340 isZerosVector(MLD->getPassThru().getNode()))) {
21341 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
21342 unsigned PgPattern = Mask->getConstantOperandVal(0);
21343 EVT VT = N->getValueType(0);
21344
21345 // Ensure we can double the size of the predicate pattern
21346 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
21347 if (NumElts &&
21348 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
21349 Mask =
21350 getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
21351 SDValue PassThru = DAG.getConstant(0, DL, VT);
21352 SDValue NewLoad = DAG.getMaskedLoad(
21353 VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
21354 PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
21355 MLD->getAddressingMode(), ISD::ZEXTLOAD);
21356 
21357 DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));
21358
21359 return NewLoad;
21360 }
21361 }
21362 }
21363
21364 return SDValue();
21365}
21366
21367static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N) {
21368 if (N->getOpcode() != AArch64ISD::UZP1)
21369 return false;
21370 SDValue Op0 = N->getOperand(0);
21371 EVT SrcVT = Op0->getValueType(0);
21372 EVT DstVT = N->getValueType(0);
21373 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv16i8) ||
21374 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv8i16) ||
21375 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv4i32);
21376}
21377
21378// Try to combine rounding shifts where the operands come from an extend, and
21379// the result is truncated and combined into one vector.
21380// uzp1(rshrnb(uunpklo(X),C), rshrnb(uunpkhi(X), C)) -> urshr(X, C)
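// Editor's note (illustrative, not in the original source): e.g. for
// X : nxv8i16, uunpklo/uunpkhi widen X to two nxv4i32 halves, each half is
// rounding-shift-narrowed by C, and uzp1 reassembles an nxv8i16 result; the
// combine replaces that whole sequence with one predicated URSHR of X by C.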
21381static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG) {
21382 assert(N->getOpcode() == AArch64ISD::UZP1 && "Only UZP1 expected.");
21383 SDValue Op0 = N->getOperand(0);
21384 SDValue Op1 = N->getOperand(1);
21385 EVT ResVT = N->getValueType(0);
21386
21387 unsigned RshOpc = Op0.getOpcode();
21388 if (RshOpc != AArch64ISD::RSHRNB_I)
21389 return SDValue();
21390
21391 // Same op code and imm value?
21392 SDValue ShiftValue = Op0.getOperand(1);
21393 if (RshOpc != Op1.getOpcode() || ShiftValue != Op1.getOperand(1))
21394 return SDValue();
21395
21396 // Same unextended operand value?
21397 SDValue Lo = Op0.getOperand(0);
21398 SDValue Hi = Op1.getOperand(0);
21399 if (Lo.getOpcode() != AArch64ISD::UUNPKLO &&
21400 Hi.getOpcode() != AArch64ISD::UUNPKHI)
21401 return SDValue();
21402 SDValue OrigArg = Lo.getOperand(0);
21403 if (OrigArg != Hi.getOperand(0))
21404 return SDValue();
21405
21406 SDLoc DL(N);
21407 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, ResVT,
21408 getPredicateForVector(DAG, DL, ResVT), OrigArg,
21409 ShiftValue);
21410}
21411
21412// Try to simplify:
21413// t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
21414// t2 = nxv8i16 srl(t1, ShiftValue)
21415// to
21416// t1 = nxv8i16 rshrnb(X, shiftvalue).
21417// rshrnb will zero the top half bits of each element. Therefore, this combine
21418// should only be performed when a following instruction with the rshrnb
21419// as an operand does not care about the top half of each element. For example,
21420// a uzp1 or a truncating store.
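// Editor's note (illustrative, not in the original source): with
// ShiftValue == 8 on nxv8i16, srl(add(X, 128), 8) becomes rshrnb(X, 8); the
// rounded value lands in the low half of each element and the top half is
// zeroed, hence the restriction to consumers that ignore the top half.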
21421static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG,
21422 const AArch64Subtarget *Subtarget) {
21423 EVT VT = Srl->getValueType(0);
21424 if (!VT.isScalableVector() || !Subtarget->hasSVE2())
21425 return SDValue();
21426
21427 EVT ResVT;
21428 if (VT == MVT::nxv8i16)
21429 ResVT = MVT::nxv16i8;
21430 else if (VT == MVT::nxv4i32)
21431 ResVT = MVT::nxv8i16;
21432 else if (VT == MVT::nxv2i64)
21433 ResVT = MVT::nxv4i32;
21434 else
21435 return SDValue();
21436
21437 SDLoc DL(Srl);
21438 unsigned ShiftValue;
21439 SDValue RShOperand;
21440 if (!canLowerSRLToRoundingShiftForVT(Srl, ResVT, DAG, ShiftValue, RShOperand))
21441 return SDValue();
21442 SDValue Rshrnb = DAG.getNode(
21443 AArch64ISD::RSHRNB_I, DL, ResVT,
21444 {RShOperand, DAG.getTargetConstant(ShiftValue, DL, MVT::i32)});
21445 return DAG.getNode(ISD::BITCAST, DL, VT, Rshrnb);
21446}
21447
21448static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
21449 const AArch64Subtarget *Subtarget) {
21450 SDLoc DL(N);
21451 SDValue Op0 = N->getOperand(0);
21452 SDValue Op1 = N->getOperand(1);
21453 EVT ResVT = N->getValueType(0);
21454
21455 // uzp1(x, undef) -> concat(truncate(x), undef)
21456 if (Op1.getOpcode() == ISD::UNDEF) {
21457 EVT BCVT = MVT::Other, HalfVT = MVT::Other;
21458 switch (ResVT.getSimpleVT().SimpleTy) {
21459 default:
21460 break;
21461 case MVT::v16i8:
21462 BCVT = MVT::v8i16;
21463 HalfVT = MVT::v8i8;
21464 break;
21465 case MVT::v8i16:
21466 BCVT = MVT::v4i32;
21467 HalfVT = MVT::v4i16;
21468 break;
21469 case MVT::v4i32:
21470 BCVT = MVT::v2i64;
21471 HalfVT = MVT::v2i32;
21472 break;
21473 }
21474 if (BCVT != MVT::Other) {
21475 SDValue BC = DAG.getBitcast(BCVT, Op0);
21476 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
21477 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
21478 DAG.getUNDEF(HalfVT));
21479 }
21480 }
21481
21482 if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
21483 return Urshr;
21484
21485 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op0, DAG, Subtarget))
21486 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1);
21487
21488 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op1, DAG, Subtarget))
21489 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb);
21490
21491 // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
21492 if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
21493 if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
21494 SDValue X = Op0.getOperand(0).getOperand(0);
21495 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
21496 }
21497 }
21498
21499 // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z)
21500 if (Op1.getOpcode() == AArch64ISD::UUNPKHI) {
21501 if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
21502 SDValue Z = Op1.getOperand(0).getOperand(1);
21503 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
21504 }
21505 }
21506
21507 // These optimizations only work on little endian.
21508 if (!DAG.getDataLayout().isLittleEndian())
21509 return SDValue();
21510
21511 // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
21512 // Example:
21513 // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
21514 // to
21515 // nxv4i32 = uzp1 nxv4i32 x, nxv4i32 y
21516 if (isHalvingTruncateAndConcatOfLegalIntScalableType(N) &&
21517 Op0.getOpcode() == ISD::BITCAST && Op1.getOpcode() == ISD::BITCAST) {
21518 if (Op0.getOperand(0).getValueType() == Op1.getOperand(0).getValueType()) {
21519 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0.getOperand(0),
21520 Op1.getOperand(0));
21521 }
21522 }
21523
21524 if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
21525 return SDValue();
21526
21527 SDValue SourceOp0 = peekThroughBitcasts(Op0);
21528 SDValue SourceOp1 = peekThroughBitcasts(Op1);
21529
21530 // truncating uzp1(x, y) -> xtn(concat (x, y))
21531 if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
21532 EVT Op0Ty = SourceOp0.getValueType();
21533 if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
21534 (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
21535 SDValue Concat =
21536 DAG.getNode(ISD::CONCAT_VECTORS, DL,
21537 Op0Ty.getDoubleNumVectorElementsVT(*DAG.getContext()),
21538 SourceOp0, SourceOp1);
21539 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Concat);
21540 }
21541 }
21542
21543 // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
21544 if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
21545 SourceOp1.getOpcode() != ISD::TRUNCATE)
21546 return SDValue();
21547 SourceOp0 = SourceOp0.getOperand(0);
21548 SourceOp1 = SourceOp1.getOperand(0);
21549
21550 if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
21551 !SourceOp0.getValueType().isSimple())
21552 return SDValue();
21553
21554 EVT ResultTy;
21555
21556 switch (SourceOp0.getSimpleValueType().SimpleTy) {
21557 case MVT::v2i64:
21558 ResultTy = MVT::v4i32;
21559 break;
21560 case MVT::v4i32:
21561 ResultTy = MVT::v8i16;
21562 break;
21563 case MVT::v8i16:
21564 ResultTy = MVT::v16i8;
21565 break;
21566 default:
21567 return SDValue();
21568 }
21569
21570 SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
21571 SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
21572 SDValue UzpResult =
21573 DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
21574
21575 EVT BitcastResultTy;
21576
21577 switch (ResVT.getSimpleVT().SimpleTy) {
21578 case MVT::v2i32:
21579 BitcastResultTy = MVT::v2i64;
21580 break;
21581 case MVT::v4i16:
21582 BitcastResultTy = MVT::v4i32;
21583 break;
21584 case MVT::v8i8:
21585 BitcastResultTy = MVT::v8i16;
21586 break;
21587 default:
21588 llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
21589 }
21590
21591 return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
21592 DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
21593}
21594
21595static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
21596 unsigned Opc = N->getOpcode();
21597
21598 assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
21599 Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) ||
21600 (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
21601 Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) &&
21602 "Invalid opcode.");
21603
21604 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
21605 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
21606 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
21607 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
21608 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
21609 Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
21610 Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
21611 Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
21612 
21613 SDLoc DL(N);
21614 SDValue Chain = N->getOperand(0);
21615 SDValue Pg = N->getOperand(1);
21616 SDValue Base = N->getOperand(2);
21617 SDValue Offset = N->getOperand(3);
21618 SDValue Ty = N->getOperand(4);
21619
21620 EVT ResVT = N->getValueType(0);
21621
21622 const auto OffsetOpc = Offset.getOpcode();
21623 const bool OffsetIsZExt =
21624 OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
21625 const bool OffsetIsSExt =
21626 OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
21627 
21628 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
21629 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
21630 SDValue ExtPg = Offset.getOperand(0);
21631 VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
21632 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
21633
21634 // If the predicate for the sign- or zero-extended offset is the
21635 // same as the predicate used for this load and the sign-/zero-extension
21636 // was from 32 bits...
21637 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
21638 SDValue UnextendedOffset = Offset.getOperand(1);
21639
21640 unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
21641 if (Signed)
21642 NewOpc = getSignExtendedGatherOpcode(NewOpc);
21643
21644 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
21645 {Chain, Pg, Base, UnextendedOffset, Ty});
21646 }
21647 }
21648
21649 return SDValue();
21650}
21651
21652/// Optimize a vector shift instruction and its operand if shifted out
21653/// bits are not used.
21654static SDValue performVectorShiftCombine(SDNode *N,
21655 const AArch64TargetLowering &TLI,
21656 TargetLowering::DAGCombinerInfo &DCI) {
21657 assert(N->getOpcode() == AArch64ISD::VASHR ||
21658 N->getOpcode() == AArch64ISD::VLSHR);
21659
21660 SDValue Op = N->getOperand(0);
21661 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
21662
21663 unsigned ShiftImm = N->getConstantOperandVal(1);
21664 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
21665
21666 // Remove sign_extend_inreg (ashr(shl(x)) based on the number of sign bits.
21667 if (N->getOpcode() == AArch64ISD::VASHR &&
21668 Op.getOpcode() == AArch64ISD::VSHL &&
21669 N->getOperand(1) == Op.getOperand(1))
21670 if (DCI.DAG.ComputeNumSignBits(Op.getOperand(0)) > ShiftImm)
21671 return Op.getOperand(0);
21672
21673 APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
21674 APInt DemandedMask = ~ShiftedOutBits;
21675
21676 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
21677 return SDValue(N, 0);
21678
21679 return SDValue();
21680}
21681
21682static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) {
21683 // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
21684 // This transform works in partnership with performSetCCPunpkCombine to
21685 // remove unnecessary transfer of predicates into standard registers and back
21686 if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
21687 N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
21688 MVT::i1) {
21689 SDValue CC = N->getOperand(0)->getOperand(0);
21690 auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
21691 SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
21692 DAG.getVectorIdxConstant(0, SDLoc(N)));
21693 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
21694 }
21695
21696 return SDValue();
21697}
21698
21699/// Target-specific DAG combine function for post-increment LD1 (lane) and
21700/// post-increment LD1R.
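// Editor's note (illustrative, not in the original source): e.g. a scalar i32
// load that only feeds a duplicate into v4i32, with the address also
// incremented by 4 bytes, becomes "ld1r { v0.4s }, [x0], #4" (LD1DUPpost),
// which also produces the updated address; the lane variant similarly maps to
// "ld1 { v0.s }[lane], [x0], #4" (LD1LANEpost).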
21701static SDValue performPostLD1Combine(SDNode *N,
21702 TargetLowering::DAGCombinerInfo &DCI,
21703 bool IsLaneOp) {
21704 if (DCI.isBeforeLegalizeOps())
21705 return SDValue();
21706
21707 SelectionDAG &DAG = DCI.DAG;
21708 EVT VT = N->getValueType(0);
21709
21710 if (!VT.is128BitVector() && !VT.is64BitVector())
21711 return SDValue();
21712
21713 unsigned LoadIdx = IsLaneOp ? 1 : 0;
21714 SDNode *LD = N->getOperand(LoadIdx).getNode();
21715 // If it is not a LOAD, we cannot do this combine.
21716 if (LD->getOpcode() != ISD::LOAD)
21717 return SDValue();
21718
21719 // The vector lane must be a constant in the LD1LANE opcode.
21720 SDValue Lane;
21721 if (IsLaneOp) {
21722 Lane = N->getOperand(2);
21723 auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
21724 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
21725 return SDValue();
21726 }
21727
21728 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
21729 EVT MemVT = LoadSDN->getMemoryVT();
21730 // Check if memory operand is the same type as the vector element.
21731 if (MemVT != VT.getVectorElementType())
21732 return SDValue();
21733
21734 // Check if there are other uses. If so, do not combine as it will introduce
21735 // an extra load.
21736 for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
21737 ++UI) {
21738 if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
21739 continue;
21740 if (*UI != N)
21741 return SDValue();
21742 }
21743
21744 // If there is one use and it can splat the value, prefer that operation.
21745 // TODO: This could be expanded to more operations if they reliably use the
21746 // index variants.
21747 if (N->hasOneUse()) {
21748 unsigned UseOpc = N->use_begin()->getOpcode();
21749 if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
21750 return SDValue();
21751 }
21752
21753 SDValue Addr = LD->getOperand(1);
21754 SDValue Vector = N->getOperand(0);
21755 // Search for a use of the address operand that is an increment.
21756 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
21757 Addr.getNode()->use_end(); UI != UE; ++UI) {
21758 SDNode *User = *UI;
21759 if (User->getOpcode() != ISD::ADD
21760 || UI.getUse().getResNo() != Addr.getResNo())
21761 continue;
21762
21763 // If the increment is a constant, it must match the memory ref size.
21764 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
21765 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
21766 uint32_t IncVal = CInc->getZExtValue();
21767 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
21768 if (IncVal != NumBytes)
21769 continue;
21770 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
21771 }
21772
21773 // To avoid cycle construction make sure that neither the load nor the add
21774 // are predecessors to each other or the Vector.
21775 SmallPtrSet<const SDNode *, 32> Visited;
21776 SmallVector<const SDNode *, 16> Worklist;
21777 Visited.insert(Addr.getNode());
21778 Worklist.push_back(User);
21779 Worklist.push_back(LD);
21780 Worklist.push_back(Vector.getNode());
21781 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
21782 SDNode::hasPredecessorHelper(User, Visited, Worklist))
21783 continue;
21784
21785 SmallVector<SDValue, 8> Ops;
21786 Ops.push_back(LD->getOperand(0)); // Chain
21787 if (IsLaneOp) {
21788 Ops.push_back(Vector); // The vector to be inserted
21789 Ops.push_back(Lane); // The lane to be inserted in the vector
21790 }
21791 Ops.push_back(Addr);
21792 Ops.push_back(Inc);
21793
21794 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
21795 SDVTList SDTys = DAG.getVTList(Tys);
21796 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
21797 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
21798 MemVT,
21799 LoadSDN->getMemOperand());
21800
21801 // Update the uses.
21802 SDValue NewResults[] = {
21803 SDValue(LD, 0), // The result of load
21804 SDValue(UpdN.getNode(), 2) // Chain
21805 };
21806 DCI.CombineTo(LD, NewResults);
21807 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
21808 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
21809
21810 break;
21811 }
21812 return SDValue();
21813}
21814
21815/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
21816/// address translation.
21817static bool performTBISimplification(SDValue Addr,
21818 TargetLowering::DAGCombinerInfo &DCI,
21819 SelectionDAG &DAG) {
21820 APInt DemandedMask = APInt::getLowBitsSet(64, 56);
21821 KnownBits Known;
21822 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
21823 !DCI.isBeforeLegalizeOps());
21824 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21825 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
21826 DCI.CommitTargetLoweringOpt(TLO);
21827 return true;
21828 }
21829 return false;
21830}
21831
21832static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
21833 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
21834 "Expected STORE dag node in input!");
21835
21836 if (auto Store = dyn_cast<StoreSDNode>(N)) {
21837 if (!Store->isTruncatingStore() || Store->isIndexed())
21838 return SDValue();
21839 SDValue Ext = Store->getValue();
21840 auto ExtOpCode = Ext.getOpcode();
21841 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
21842 ExtOpCode != ISD::ANY_EXTEND)
21843 return SDValue();
21844 SDValue Orig = Ext->getOperand(0);
21845 if (Store->getMemoryVT() != Orig.getValueType())
21846 return SDValue();
21847 return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
21848 Store->getBasePtr(), Store->getMemOperand());
21849 }
21850
21851 return SDValue();
21852}
21853
21854// A custom combine to lower load <3 x i8> as the more efficient sequence
21855// below:
21856// ldrb wX, [x0, #2]
21857// ldrh wY, [x0]
21858// orr wX, wY, wX, lsl #16
21859// fmov s0, wX
21860//
21861// Note that an alternative sequence with even fewer (although usually more
21862// complex/expensive) instructions would be:
21863// ld1r.4h { v0 }, [x0], #2
21864// ld1.b { v0 }[2], [x0]
21865//
21866// Generating this sequence unfortunately results in noticeably worse codegen
21867// for code that extends the loaded v3i8, due to legalization breaking vector
21868// shuffle detection in a way that is very difficult to work around.
21869// TODO: Revisit once v3i8 legalization has been improved in general.
21870static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
21871 EVT MemVT = LD->getMemoryVT();
21872 if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
21873 LD->getOriginalAlign() >= 4)
21874 return SDValue();
21875
21876 SDLoc DL(LD);
21877 MachineFunction &MF = DAG.getMachineFunction();
21878 SDValue Chain = LD->getChain();
21879 SDValue BasePtr = LD->getBasePtr();
21880 MachineMemOperand *MMO = LD->getMemOperand();
21881 assert(LD->getOffset().isUndef() && "undef offset expected");
21882
21883 // Load 2 x i8, then 1 x i8.
21884 SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
21885 TypeSize Offset2 = TypeSize::getFixed(2);
21886 SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
21887 DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
21888 MF.getMachineMemOperand(MMO, 2, 1));
21889
21890 // Extend to i32.
21891 SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
21892 SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
21893
21894 // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
21895 SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
21896 DAG.getConstant(16, DL, MVT::i32));
21897 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl);
21898 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
21899
21900 // Extract v3i8 again.
21901 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
21902 DAG.getConstant(0, DL, MVT::i64));
21903 SDValue TokenFactor = DAG.getNode(
21904 ISD::TokenFactor, DL, MVT::Other,
21905 {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
21906 return DAG.getMergeValues({Extract, TokenFactor}, DL);
21907}
21908
21909// Perform TBI simplification if supported by the target and try to break up
21910 // nontemporal loads larger than 256 bits for odd types so LDNPQ 256-bit
21911// load instructions can be selected.
21912static SDValue performLOADCombine(SDNode *N,
21913 TargetLowering::DAGCombinerInfo &DCI,
21914 SelectionDAG &DAG,
21915 const AArch64Subtarget *Subtarget) {
21916 if (Subtarget->supportsAddressTopByteIgnored())
21917 performTBISimplification(N->getOperand(1), DCI, DAG);
21918
21919 LoadSDNode *LD = cast<LoadSDNode>(N);
21920 if (LD->isVolatile() || !Subtarget->isLittleEndian())
21921 return SDValue(N, 0);
21922
21923 if (SDValue Res = combineV3I8LoadExt(LD, DAG))
21924 return Res;
21925
21926 if (!LD->isNonTemporal())
21927 return SDValue(N, 0);
21928
21929 EVT MemVT = LD->getMemoryVT();
21930 if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
21931 MemVT.getSizeInBits() % 256 == 0 ||
21932 256 % MemVT.getScalarSizeInBits() != 0)
21933 return SDValue(N, 0);
21934
21935 SDLoc DL(LD);
21936 SDValue Chain = LD->getChain();
21937 SDValue BasePtr = LD->getBasePtr();
21938 SDNodeFlags Flags = LD->getFlags();
21939 SmallVector<SDValue, 4> LoadOps;
21940 SmallVector<SDValue, 4> LoadOpsChain;
21941 // Replace any nontemporal load over 256 bits with a series of 256-bit loads
21942 // and one final vector load of fewer than 256 bits. This way we can utilize
21943 // 256-bit loads and reduce the number of load instructions generated.
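// Editor's note (illustrative, not in the original source): e.g. a 640-bit
// nontemporal v20i32 load is split into two 256-bit v8i32 loads (byte offsets
// 0 and 32), a v4i32 load of the remaining 128 bits at offset 64 inserted into
// an undef v8i32, a CONCAT_VECTORS of the three v8i32 values, and an
// EXTRACT_SUBVECTOR that recovers the original v20i32.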
21944 MVT NewVT =
21945 MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
21946 256 / MemVT.getVectorElementType().getSizeInBits());
21947 unsigned Num256Loads = MemVT.getSizeInBits() / 256;
21948 // Create all the 256-bit loads, from offset 0 up to (Num256Loads - 1) * 32.
21949 for (unsigned I = 0; I < Num256Loads; I++) {
21950 unsigned PtrOffset = I * 32;
21951 SDValue NewPtr = DAG.getMemBasePlusOffset(
21952 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
21953 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
21954 SDValue NewLoad = DAG.getLoad(
21955 NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
21956 NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
21957 LoadOps.push_back(NewLoad);
21958 LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
21959 }
21960
21961 // Process remaining bits of the load operation.
21962 // This is done by creating an UNDEF vector to match the size of the
21963 // 256-bit loads and inserting the remaining load into it. We extract the
21964 // original load type at the end using an EXTRACT_SUBVECTOR.
21965 unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
21966 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
21967 MVT RemainingVT = MVT::getVectorVT(
21968 MemVT.getVectorElementType().getSimpleVT(),
21969 BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
21970 SDValue NewPtr = DAG.getMemBasePlusOffset(
21971 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
21972 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
21973 SDValue RemainingLoad =
21974 DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
21975 LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
21976 LD->getMemOperand()->getFlags(), LD->getAAInfo());
21977 SDValue UndefVector = DAG.getUNDEF(NewVT);
21978 SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
21979 SDValue ExtendedRemainingLoad =
21980 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
21981 {UndefVector, RemainingLoad, InsertIdx});
21982 LoadOps.push_back(ExtendedRemainingLoad);
21983 LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
21984 EVT ConcatVT =
21985 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
21986 LoadOps.size() * NewVT.getVectorNumElements());
21987 SDValue ConcatVectors =
21988 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
21989 // Extract the original vector type size.
21990 SDValue ExtractSubVector =
21991 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
21992 {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
21993 SDValue TokenFactor =
21994 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
21995 return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
21996}
21997
21998static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) {
21999 EVT VecVT = Op.getValueType();
22000 assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
22001 "Need boolean vector type.");
22002
22003 if (Depth > 3)
22004 return MVT::INVALID_SIMPLE_VALUE_TYPE;
22005 
22006 // We can get the base type from a vector compare or truncate.
22007 if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
22008 return Op.getOperand(0).getValueType();
22009
22010 // If an operand is a bool vector, continue looking.
22011 EVT BaseVT(MVT::INVALID_SIMPLE_VALUE_TYPE);
22012 for (SDValue Operand : Op->op_values()) {
22013 if (Operand.getValueType() != VecVT)
22014 continue;
22015
22016 EVT OperandVT = tryGetOriginalBoolVectorType(Operand, Depth + 1);
22017 if (!BaseVT.isSimple())
22018 BaseVT = OperandVT;
22019 else if (OperandVT != BaseVT)
22020 return MVT::INVALID_SIMPLE_VALUE_TYPE;
22021 }
22022
22023 return BaseVT;
22024}
22025
22026// When converting a <N x iX> vector to <N x i1> to store or use as a scalar
22027// iN, we can use a trick that extracts the i^th bit from the i^th element and
22028// then performs a vector add to get a scalar bitmask. This requires that each
22029// element's bits are either all 1 or all 0.
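// Editor's note (illustrative, not in the original source): e.g. a v4i32
// comparison result with lanes {-1, 0, -1, -1} is ANDed with the mask
// {1, 2, 4, 8} and reduced with VECREDUCE_ADD, giving 1 + 4 + 8 = 13 (0b1101),
// i.e. bit i of the scalar is set exactly when lane i was true.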
22030static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
22031 SDLoc DL(N);
22032 SDValue ComparisonResult(N, 0);
22033 EVT VecVT = ComparisonResult.getValueType();
22034 assert(VecVT.isVector() && "Must be a vector type");
22035
22036 unsigned NumElts = VecVT.getVectorNumElements();
22037 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
22038 return SDValue();
22039
22040 if (VecVT.getVectorElementType() != MVT::i1 &&
22041 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
22042 return SDValue();
22043
22044 // If we can find the original types to work on instead of a vector of i1,
22045 // we can avoid extend/extract conversion instructions.
22046 if (VecVT.getVectorElementType() == MVT::i1) {
22047 VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
22048 if (!VecVT.isSimple()) {
22049 unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector
22050 VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
22051 }
22052 }
22053 VecVT = VecVT.changeVectorElementTypeToInteger();
22054
22055 // Large vectors don't map directly to this conversion, so to avoid too many
22056 // edge cases, we don't apply it here. The conversion will likely still be
22057 // applied later via multiple smaller vectors, whose results are concatenated.
22058 if (VecVT.getSizeInBits() > 128)
22059 return SDValue();
22060
22061 // Ensure that all elements' bits are either 0s or 1s.
22062 ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
22063
22064 SmallVector<SDValue, 16> MaskConstants;
22065 if (VecVT == MVT::v16i8) {
22066 // v16i8 is a special case, as we have 16 entries but only 8 positional bits
22067 // per entry. We split it into two halves, apply the mask, zip the halves to
22068 // create 8x 16-bit values, and then perform the vector reduce.
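// Editor's note (illustrative, not in the original source): after the AND, the
// EXT/ZIP1 pair places byte i next to byte i+8, so each 16-bit lane holds
// element i's bit in its low byte and element i+8's bit in its high byte; the
// VECREDUCE_ADD over v8i16 then yields the full 16-bit mask.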
22069 for (unsigned Half = 0; Half < 2; ++Half) {
22070 for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) {
22071 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
22072 }
22073 }
22074 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
22075 SDValue RepresentativeBits =
22076 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
22077
22078 SDValue UpperRepresentativeBits =
22079 DAG.getNode(AArch64ISD::EXT, DL, VecVT, RepresentativeBits,
22080 RepresentativeBits, DAG.getConstant(8, DL, MVT::i32));
22081 SDValue Zipped = DAG.getNode(AArch64ISD::ZIP1, DL, VecVT,
22082 RepresentativeBits, UpperRepresentativeBits);
22083 Zipped = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Zipped);
22084 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, Zipped);
22085 }
22086
22087 // All other vector sizes.
22088 unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1);
22089 for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) {
22090 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
22091 }
22092
22093 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
22094 SDValue RepresentativeBits =
22095 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
22096 EVT ResultVT = MVT::getIntegerVT(std::max<unsigned>(
22097 NumElts, VecVT.getVectorElementType().getSizeInBits()));
22098 return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits);
22099}
22100
22101static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
22102 StoreSDNode *Store) {
22103 if (!Store->isTruncatingStore())
22104 return SDValue();
22105
22106 SDLoc DL(Store);
22107 SDValue VecOp = Store->getValue();
22108 EVT VT = VecOp.getValueType();
22109 EVT MemVT = Store->getMemoryVT();
22110
22111 if (!MemVT.isVector() || !VT.isVector() ||
22112 MemVT.getVectorElementType() != MVT::i1)
22113 return SDValue();
22114
22115 // If we are storing a vector that we are currently building, let
22116 // `scalarizeVectorStore()` handle this more efficiently.
22117 if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
22118 return SDValue();
22119
22120 VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp);
22121 SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG);
22122 if (!VectorBits)
22123 return SDValue();
22124
22125 EVT StoreVT =
22126 EVT::getIntegerVT(*DAG.getContext(), MemVT.getStoreSizeInBits());
22127 SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT);
22128 return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(),
22129 Store->getMemOperand());
22130}
22131
22132static bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
22133 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
22134 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
22135 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
22136}
22137
22138// Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
22139static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
22140 const AArch64Subtarget *Subtarget) {
22141 SDValue Value = ST->getValue();
22142 EVT ValueVT = Value.getValueType();
22143
22144 if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
22145 Value.getOpcode() != ISD::TRUNCATE ||
22146 ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
22147 return SDValue();
22148
22149 assert(ST->getOffset().isUndef() && "undef offset expected");
22150 SDLoc DL(ST);
22151 auto WideVT = EVT::getVectorVT(
22152 *DAG.getContext(),
22153 Value->getOperand(0).getValueType().getVectorElementType(), 4);
22154 SDValue UndefVector = DAG.getUNDEF(WideVT);
22155 SDValue WideTrunc = DAG.getNode(
22156 ISD::INSERT_SUBVECTOR, DL, WideVT,
22157 {UndefVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)});
22158 SDValue Cast = DAG.getNode(
22159 ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
22160 WideTrunc);
22161 MachineFunction &MF = DAG.getMachineFunction();
22162 
22163 SDValue Chain = ST->getChain();
22164 MachineMemOperand *MMO = ST->getMemOperand();
22165 unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
22166 SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22167 DAG.getConstant(2 * IdxScale, DL, MVT::i64));
22168 TypeSize Offset2 = TypeSize::getFixed(2);
22169 SDValue Ptr2 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset2, DL);
22170 Chain = DAG.getStore(Chain, DL, E2, Ptr2, MF.getMachineMemOperand(MMO, 2, 1));
22171
22172 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22173 DAG.getConstant(1 * IdxScale, DL, MVT::i64));
22174 TypeSize Offset1 = TypeSize::getFixed(1);
22175 SDValue Ptr1 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset1, DL);
22176 Chain = DAG.getStore(Chain, DL, E1, Ptr1, MF.getMachineMemOperand(MMO, 1, 1));
22177
22178 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22179 DAG.getConstant(0, DL, MVT::i64));
22180 Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(),
22181 MF.getMachineMemOperand(MMO, 0, 1));
22182 return Chain;
22183}
22184
22185static SDValue performSTORECombine(SDNode *N,
22186 TargetLowering::DAGCombinerInfo &DCI,
22187 SelectionDAG &DAG,
22188 const AArch64Subtarget *Subtarget) {
22189 StoreSDNode *ST = cast<StoreSDNode>(N);
22190 SDValue Chain = ST->getChain();
22191 SDValue Value = ST->getValue();
22192 SDValue Ptr = ST->getBasePtr();
22193 EVT ValueVT = Value.getValueType();
22194
22195 auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
22196 EVT EltVT = VT.getVectorElementType();
22197 return EltVT == MVT::f32 || EltVT == MVT::f64;
22198 };
22199
22200 if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
22201 return Res;
22202
22203 // If this is an FP_ROUND followed by a store, fold this into a truncating
22204 // store. We can do this even if this is already a truncstore.
22205 // We purposefully don't care about legality of the nodes here as we know
22206 // they can be split down into something legal.
22207 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
22208 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
22209 Subtarget->useSVEForFixedLengthVectors() &&
22210 ValueVT.isFixedLengthVector() &&
22211 ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
22212 hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
22213 return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
22214 ST->getMemoryVT(), ST->getMemOperand());
22215
22216 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
22217 return Split;
22218
22219 if (Subtarget->supportsAddressTopByteIgnored() &&
22220 performTBISimplification(N->getOperand(2), DCI, DAG))
22221 return SDValue(N, 0);
22222
22223 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
22224 return Store;
22225
22226 if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
22227 return Store;
22228
22229 if (ST->isTruncatingStore()) {
22230 EVT StoreVT = ST->getMemoryVT();
22231 if (!isHalvingTruncateOfLegalScalableType(ValueVT, StoreVT))
22232 return SDValue();
22233 if (SDValue Rshrnb =
22234 trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) {
22235 return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(),
22236 StoreVT, ST->getMemOperand());
22237 }
22238 }
22239
22240 return SDValue();
22241}
22242
22243static SDValue performMSTORECombine(SDNode *N,
22244 TargetLowering::DAGCombinerInfo &DCI,
22245 SelectionDAG &DAG,
22246 const AArch64Subtarget *Subtarget) {
22247 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
22248 SDValue Value = MST->getValue();
22249 SDValue Mask = MST->getMask();
22250 SDLoc DL(N);
22251
22252 // If this is a UZP1 followed by a masked store, fold this into a masked
22253 // truncating store. We can do this even if this is already a masked
22254 // truncstore.
22255 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
22256 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
22257 Value.getValueType().isInteger()) {
22258 Value = Value.getOperand(0);
22259 if (Value.getOpcode() == ISD::BITCAST) {
22260 EVT HalfVT =
22261 Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
22262 EVT InVT = Value.getOperand(0).getValueType();
22263
22264 if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
22265 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
22266 unsigned PgPattern = Mask->getConstantOperandVal(0);
22267
22268 // Ensure we can double the size of the predicate pattern
22269 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
22270 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
22271 MinSVESize) {
22272 Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
22273 PgPattern);
22274 return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
22275 MST->getBasePtr(), MST->getOffset(), Mask,
22276 MST->getMemoryVT(), MST->getMemOperand(),
22277 MST->getAddressingMode(),
22278 /*IsTruncating=*/true);
22279 }
22280 }
22281 }
22282 }
22283
22284 if (MST->isTruncatingStore()) {
22285 EVT ValueVT = Value->getValueType(0);
22286 EVT MemVT = MST->getMemoryVT();
22287 if (!isHalvingTruncateOfLegalScalableType(ValueVT, MemVT))
22288 return SDValue();
22289 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Value, DAG, Subtarget)) {
22290 return DAG.getMaskedStore(MST->getChain(), DL, Rshrnb, MST->getBasePtr(),
22291 MST->getOffset(), MST->getMask(),
22292 MST->getMemoryVT(), MST->getMemOperand(),
22293 MST->getAddressingMode(), true);
22294 }
22295 }
22296
22297 return SDValue();
22298}
22299
22300/// \return true if part of the index was folded into the Base.
22301static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
22302 SDLoc DL, SelectionDAG &DAG) {
22303 // This function assumes a vector of i64 indices.
22304 EVT IndexVT = Index.getValueType();
22305 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
22306 return false;
22307
22308 // Simplify:
22309 // BasePtr = Ptr
22310 // Index = X + splat(Offset)
22311 // ->
22312 // BasePtr = Ptr + Offset * scale.
22313 // Index = X
22314 if (Index.getOpcode() == ISD::ADD) {
22315 if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
22316 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
22317 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
22318 Index = Index.getOperand(0);
22319 return true;
22320 }
22321 }
22322
22323 // Simplify:
22324 // BasePtr = Ptr
22325 // Index = (X + splat(Offset)) << splat(Shift)
22326 // ->
22327 // BasePtr = Ptr + (Offset << Shift) * scale)
22328 // Index = X << splat(shift)
22329 if (Index.getOpcode() == ISD::SHL &&
22330 Index.getOperand(0).getOpcode() == ISD::ADD) {
22331 SDValue Add = Index.getOperand(0);
22332 SDValue ShiftOp = Index.getOperand(1);
22333 SDValue OffsetOp = Add.getOperand(1);
22334 if (auto Shift = DAG.getSplatValue(ShiftOp))
22335 if (auto Offset = DAG.getSplatValue(OffsetOp)) {
22336 Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
22337 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
22338 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
22339 Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
22340 Add.getOperand(0), ShiftOp);
22341 return true;
22342 }
22343 }
22344
22345 return false;
22346}
22347
22348// Analyse the specified address returning true if a more optimal addressing
22349// mode is available. When returning true all parameters are updated to reflect
22350// their recommended values.
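// Editor's note (illustrative, not in the original source): e.g. an nxv2i64
// gather index of step_vector(8) can be replaced by an nxv2i32 step_vector(8)
// as long as the worst-case offset of the last element (min element count *
// stride * max vscale) still fits in an int32; the narrower index is cheaper
// to legalise.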
22351static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
22352 SDValue &BasePtr, SDValue &Index,
22353 SelectionDAG &DAG) {
22354 // Try to iteratively fold parts of the index into the base pointer to
22355 // simplify the index as much as possible.
22356 bool Changed = false;
22357 while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
22358 Changed = true;
22359
22360 // Only consider element types that are pointer sized as smaller types can
22361 // be easily promoted.
22362 EVT IndexVT = Index.getValueType();
22363 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
22364 return Changed;
22365
22366 // Can indices be trivially shrunk?
22367 EVT DataVT = N->getOperand(1).getValueType();
22368 // Don't attempt to shrink the index for fixed vectors of 64 bit data since it
22369 // will later be re-extended to 64 bits in legalization
22370 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
22371 return Changed;
22372 if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
22373 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
22374 Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
22375 return true;
22376 }
22377
22378 // Match:
22379 // Index = step(const)
22380 int64_t Stride = 0;
22381 if (Index.getOpcode() == ISD::STEP_VECTOR) {
22382 Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
22383 }
22384 // Match:
22385 // Index = step(const) << shift(const)
22386 else if (Index.getOpcode() == ISD::SHL &&
22387 Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
22388 SDValue RHS = Index.getOperand(1);
22389 if (auto *Shift =
22390 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
22391 int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
22392 Stride = Step << Shift->getZExtValue();
22393 }
22394 }
22395
22396 // Return early because no supported pattern is found.
22397 if (Stride == 0)
22398 return Changed;
22399
22400 if (Stride < std::numeric_limits<int32_t>::min() ||
22401 Stride > std::numeric_limits<int32_t>::max())
22402 return Changed;
22403
22404 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
22405 unsigned MaxVScale =
22406 Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
22407 int64_t LastElementOffset =
22408 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
22409
22410 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
22411 LastElementOffset > std::numeric_limits<int32_t>::max())
22412 return Changed;
22413
22414 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
22415 // Stride does not scale explicitly by 'Scale', because it happens in
22416 // the gather/scatter addressing mode.
22417 Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride));
22418 return true;
22419}
22420
22421static SDValue performMaskedGatherScatterCombine(
22422 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
22423 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
22424 assert(MGS && "Can only combine gather load or scatter store nodes");
22425
22426 if (!DCI.isBeforeLegalize())
22427 return SDValue();
22428
22429 SDLoc DL(MGS);
22430 SDValue Chain = MGS->getChain();
22431 SDValue Scale = MGS->getScale();
22432 SDValue Index = MGS->getIndex();
22433 SDValue Mask = MGS->getMask();
22434 SDValue BasePtr = MGS->getBasePtr();
22435 ISD::MemIndexType IndexType = MGS->getIndexType();
22436
22437 if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
22438 return SDValue();
22439
22440 // Here we catch such cases early and change MGATHER's IndexType to allow
22441 // the use of an Index that's more legalisation friendly.
22442 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
22443 SDValue PassThru = MGT->getPassThru();
22444 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
22445 return DAG.getMaskedGather(
22446 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
22447 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
22448 }
22449 auto *MSC = cast<MaskedScatterSDNode>(MGS);
22450 SDValue Data = MSC->getValue();
22451 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
22452 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL,
22453 Ops, MSC->getMemOperand(), IndexType,
22454 MSC->isTruncatingStore());
22455}
22456
22457/// Target-specific DAG combine function for NEON load/store intrinsics
22458/// to merge base address updates.
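// Editor's note (illustrative, not in the original source): e.g.
// "ld2 { v0.4s, v1.4s }, [x0]" followed by "add x0, x0, #32" is merged into
// the post-indexed "ld2 { v0.4s, v1.4s }, [x0], #32" (LD2post), since the
// constant increment matches the 32-byte memory footprint of the access.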
22459static SDValue performNEONPostLDSTCombine(SDNode *N,
22460 TargetLowering::DAGCombinerInfo &DCI,
22461 SelectionDAG &DAG) {
22462 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
22463 return SDValue();
22464
22465 unsigned AddrOpIdx = N->getNumOperands() - 1;
22466 SDValue Addr = N->getOperand(AddrOpIdx);
22467
22468 // Search for a use of the address operand that is an increment.
22469 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
22470 UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
22471 SDNode *User = *UI;
22472 if (User->getOpcode() != ISD::ADD ||
22473 UI.getUse().getResNo() != Addr.getResNo())
22474 continue;
22475
22476 // Check that the add is independent of the load/store. Otherwise, folding
22477 // it would create a cycle.
22478 SmallPtrSet<const SDNode *, 32> Visited;
22479 SmallVector<const SDNode *, 16> Worklist;
22480 Visited.insert(Addr.getNode());
22481 Worklist.push_back(N);
22482 Worklist.push_back(User);
22483 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
22484 SDNode::hasPredecessorHelper(User, Visited, Worklist))
22485 continue;
22486
22487 // Find the new opcode for the updating load/store.
22488 bool IsStore = false;
22489 bool IsLaneOp = false;
22490 bool IsDupOp = false;
22491 unsigned NewOpc = 0;
22492 unsigned NumVecs = 0;
22493 unsigned IntNo = N->getConstantOperandVal(1);
22494 switch (IntNo) {
22495 default: llvm_unreachable("unexpected intrinsic for Neon base update");
22496 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
22497 NumVecs = 2; break;
22498 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
22499 NumVecs = 3; break;
22500 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
22501 NumVecs = 4; break;
22502 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
22503 NumVecs = 2; IsStore = true; break;
22504 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
22505 NumVecs = 3; IsStore = true; break;
22506 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
22507 NumVecs = 4; IsStore = true; break;
22508 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
22509 NumVecs = 2; break;
22510 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
22511 NumVecs = 3; break;
22512 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
22513 NumVecs = 4; break;
22514 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
22515 NumVecs = 2; IsStore = true; break;
22516 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
22517 NumVecs = 3; IsStore = true; break;
22518 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
22519 NumVecs = 4; IsStore = true; break;
22520 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
22521 NumVecs = 2; IsDupOp = true; break;
22522 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
22523 NumVecs = 3; IsDupOp = true; break;
22524 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
22525 NumVecs = 4; IsDupOp = true; break;
22526 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
22527 NumVecs = 2; IsLaneOp = true; break;
22528 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
22529 NumVecs = 3; IsLaneOp = true; break;
22530 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
22531 NumVecs = 4; IsLaneOp = true; break;
22532 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
22533 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
22534 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
22535 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
22536 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
22537 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
22538 }
22539
22540 EVT VecTy;
22541 if (IsStore)
22542 VecTy = N->getOperand(2).getValueType();
22543 else
22544 VecTy = N->getValueType(0);
22545
22546 // If the increment is a constant, it must match the memory ref size.
22547 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
22548 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
22549 uint32_t IncVal = CInc->getZExtValue();
22550 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
22551 if (IsLaneOp || IsDupOp)
22552 NumBytes /= VecTy.getVectorNumElements();
22553 if (IncVal != NumBytes)
22554 continue;
22555 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
22556 }
22557 SmallVector<SDValue, 8> Ops;
22558 Ops.push_back(N->getOperand(0)); // Incoming chain
22559 // Load lane and store have vector list as input.
22560 if (IsLaneOp || IsStore)
22561 for (unsigned i = 2; i < AddrOpIdx; ++i)
22562 Ops.push_back(N->getOperand(i));
22563 Ops.push_back(Addr); // Base register
22564 Ops.push_back(Inc);
22565
22566 // Return Types.
22567 EVT Tys[6];
22568 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
22569 unsigned n;
22570 for (n = 0; n < NumResultVecs; ++n)
22571 Tys[n] = VecTy;
22572 Tys[n++] = MVT::i64; // Type of write back register
22573 Tys[n] = MVT::Other; // Type of the chain
22574 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
22575
22576 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
22577 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
22578 MemInt->getMemoryVT(),
22579 MemInt->getMemOperand());
22580
22581 // Update the uses.
22582 std::vector<SDValue> NewResults;
22583 for (unsigned i = 0; i < NumResultVecs; ++i) {
22584 NewResults.push_back(SDValue(UpdN.getNode(), i));
22585 }
22586 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
22587 DCI.CombineTo(N, NewResults);
22588 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
22589
22590 break;
22591 }
22592 return SDValue();
22593}
22594
22595// Checks to see if the value is the prescribed width and returns information
22596// about its extension mode.
22597static
22598bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
22599 ExtType = ISD::NON_EXTLOAD;
22600 switch(V.getNode()->getOpcode()) {
22601 default:
22602 return false;
22603 case ISD::LOAD: {
22604 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
22605 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
22606 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
22607 ExtType = LoadNode->getExtensionType();
22608 return true;
22609 }
22610 return false;
22611 }
22612 case ISD::AssertSext: {
22613 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
22614 if ((TypeNode->getVT() == MVT::i8 && width == 8)
22615 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
22616 ExtType = ISD::SEXTLOAD;
22617 return true;
22618 }
22619 return false;
22620 }
22621 case ISD::AssertZext: {
22622 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
22623 if ((TypeNode->getVT() == MVT::i8 && width == 8)
22624 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
22625 ExtType = ISD::ZEXTLOAD;
22626 return true;
22627 }
22628 return false;
22629 }
22630 case ISD::Constant:
22631 case ISD::TargetConstant: {
22632 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
22633 1LL << (width - 1);
22634 }
22635 }
22636
22637 return true;
22638}
22639
22640// This function does a whole lot of voodoo to determine if the tests are
22641// equivalent without and with a mask. Essentially what happens is that given a
22642// DAG resembling:
22643//
22644// +-------------+ +-------------+ +-------------+ +-------------+
22645// | Input | | AddConstant | | CompConstant| | CC |
22646// +-------------+ +-------------+ +-------------+ +-------------+
22647// | | | |
22648// V V | +----------+
22649// +-------------+ +----+ | |
22650// | ADD | |0xff| | |
22651// +-------------+ +----+ | |
22652// | | | |
22653// V V | |
22654// +-------------+ | |
22655// | AND | | |
22656// +-------------+ | |
22657// | | |
22658// +-----+ | |
22659// | | |
22660// V V V
22661// +-------------+
22662// | CMP |
22663// +-------------+
22664//
22665// The AND node may be safely removed for some combinations of inputs. In
22666// particular we need to take into account the extension type of the Input,
22667// the exact values of AddConstant, CompConstant, and CC, along with the nominal
22668 // width of the input (this can work for inputs of any width; the above graph
22669 // is specific to 8 bits).
22670//
22671// The specific equations were worked out by generating output tables for each
22672 // AArch64CC value in terms of the AddConstant (w1) and CompConstant (w2). The
22673// problem was simplified by working with 4 bit inputs, which means we only
22674// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
22675// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
22676// patterns present in both extensions (0,7). For every distinct set of
22677// AddConstant and CompConstants bit patterns we can consider the masked and
22678// unmasked versions to be equivalent if the result of this function is true for
22679 // all 16 distinct bit patterns for the current extension type of Input (w0).
22680//
22681// sub w8, w0, w1
22682// and w10, w8, #0x0f
22683// cmp w8, w2
22684// cset w9, AArch64CC
22685// cmp w10, w2
22686// cset w11, AArch64CC
22687// cmp w9, w11
22688// cset w0, eq
22689// ret
22690//
22691// Since the above function shows when the outputs are equivalent it defines
22692// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
22693// would be expensive to run during compiles. The equations below were written
22694 // in a test harness that confirmed they gave outputs equivalent to the above
22695 // function for all inputs, so they can be used to determine if the removal is
22696 // legal instead.
22697//
22698 // isEquivalentMaskless() is the code for testing whether the AND can be
22699 // removed, factored out of the DAG recognition as the DAG can take several forms.
22700
22701static bool isEquivalentMaskless(unsigned CC, unsigned width,
22702 ISD::LoadExtType ExtType, int AddConstant,
22703 int CompConstant) {
22704 // By being careful about our equations and only writing them in terms of
22705 // symbolic values and well-known constants (0, 1, -1, MaxUInt) we can
22706 // make them generally applicable to all bit widths.
22707 int MaxUInt = (1 << width);
22708
22709 // For the purposes of these comparisons sign extending the type is
22710 // equivalent to zero extending the add and displacing it by half the integer
22711 // width. Provided we are careful and make sure our equations are valid over
22712 // the whole range we can just adjust the input and avoid writing equations
22713 // for sign extended inputs.
22714 if (ExtType == ISD::SEXTLOAD)
22715 AddConstant -= (1 << (width-1));
22716
22717 switch(CC) {
22718 case AArch64CC::LE:
22719 case AArch64CC::GT:
22720 if ((AddConstant == 0) ||
22721 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
22722 (AddConstant >= 0 && CompConstant < 0) ||
22723 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
22724 return true;
22725 break;
22726 case AArch64CC::LT:
22727 case AArch64CC::GE:
22728 if ((AddConstant == 0) ||
22729 (AddConstant >= 0 && CompConstant <= 0) ||
22730 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
22731 return true;
22732 break;
22733 case AArch64CC::HI:
22734 case AArch64CC::LS:
22735 if ((AddConstant >= 0 && CompConstant < 0) ||
22736 (AddConstant <= 0 && CompConstant >= -1 &&
22737 CompConstant < AddConstant + MaxUInt))
22738 return true;
22739 break;
22740 case AArch64CC::PL:
22741 case AArch64CC::MI:
22742 if ((AddConstant == 0) ||
22743 (AddConstant > 0 && CompConstant <= 0) ||
22744 (AddConstant < 0 && CompConstant <= AddConstant))
22745 return true;
22746 break;
22747 case AArch64CC::LO:
22748 case AArch64CC::HS:
22749 if ((AddConstant >= 0 && CompConstant <= 0) ||
22750 (AddConstant <= 0 && CompConstant >= 0 &&
22751 CompConstant <= AddConstant + MaxUInt))
22752 return true;
22753 break;
22754 case AArch64CC::EQ:
22755 case AArch64CC::NE:
22756 if ((AddConstant > 0 && CompConstant < 0) ||
22757 (AddConstant < 0 && CompConstant >= 0 &&
22758 CompConstant < AddConstant + MaxUInt) ||
22759 (AddConstant >= 0 && CompConstant >= 0 &&
22760 CompConstant >= AddConstant) ||
22761 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
22762 return true;
22763 break;
22764 case AArch64CC::VS:
22765 case AArch64CC::VC:
22766 case AArch64CC::AL:
22767 case AArch64CC::NV:
22768 return true;
22769 case AArch64CC::Invalid:
22770 break;
22771 }
22772
22773 return false;
22774}
22775
22776 // ((X & C) >u Mask) --> ((X & (C & ~Mask)) != 0)
22777 // ((X & C) <u Pow2) --> ((X & (C & ~(Pow2 - 1))) == 0)
22778static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
22779 SDNode *AndNode, SelectionDAG &DAG,
22780 unsigned CCIndex, unsigned CmpIndex,
22781 unsigned CC) {
22782 ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(SubsNode->getOperand(1));
22783 if (!SubsC)
22784 return SDValue();
22785
22786 APInt SubsAP = SubsC->getAPIntValue();
22787 if (CC == AArch64CC::HI) {
22788 if (!SubsAP.isMask())
22789 return SDValue();
22790 } else if (CC == AArch64CC::LO) {
22791 if (!SubsAP.isPowerOf2())
22792 return SDValue();
22793 } else
22794 return SDValue();
22795
22796 ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1));
22797 if (!AndC)
22798 return SDValue();
22799
22800 APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
22801
22802 SDLoc DL(N);
22803 APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
22804 SDValue ANDS = DAG.getNode(
22805 AArch64ISD::ANDS, DL, SubsNode->getVTList(), AndNode->getOperand(0),
22806 DAG.getConstant(AndSMask, DL, SubsC->getValueType(0)));
22807 SDValue AArch64_CC =
22809 N->getOperand(CCIndex)->getValueType(0));
22810
22811 // For now, only performCSELCombine and performBRCONDCombine call this
22812 // function, and both pass 2 for CCIndex and 3 for CmpIndex on nodes with 4
22813 // operands, so the operands are initialized directly to keep the code simple.
22814 // If another caller ever uses a different CCIndex/CmpIndex, this would need
22815 // to be rewritten with a loop over the operands.
22816 // TODO: Do we need to assert that the number of operands is 4 here?
22817 assert((CCIndex == 2 && CmpIndex == 3) &&
22818 "Expected CCIndex to be 2 and CmpIndex to be 3.");
22819 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), AArch64_CC,
22820 ANDS.getValue(1)};
22821 return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops);
22822}
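// Illustrative example, not from the original source, of the transform above
// for the HI case with an AND constant of 0xff and a SUBS constant (Mask) of
// 0x0f:
//
//   (CSEL l, r, HI, (SUBS (AND x, 0xff), 0x0f))
//     ==> (CSEL l, r, NE, (ANDS x, 0xf0))
//
// "(x & 0xff) >u 15" holds exactly when one of the bits in 0xff & ~0x0f == 0xf0
// is set, so the compare can be replaced by an ANDS against that mask and a
// plain test for a non-zero result.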
22823
22824static
22825SDValue
22826performCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
22827 SelectionDAG &DAG, unsigned CCIndex,
22828 unsigned CmpIndex) {
22829 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
22830 SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
22831 unsigned CondOpcode = SubsNode->getOpcode();
22832
22833 if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0))
22834 return SDValue();
22835
22836 // There is a SUBS feeding this condition. Is it fed by a mask we can
22837 // use?
22838
22839 SDNode *AndNode = SubsNode->getOperand(0).getNode();
22840 unsigned MaskBits = 0;
22841
22842 if (AndNode->getOpcode() != ISD::AND)
22843 return SDValue();
22844
22845 if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
22846 CmpIndex, CC))
22847 return Val;
22848
22849 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
22850 uint32_t CNV = CN->getZExtValue();
22851 if (CNV == 255)
22852 MaskBits = 8;
22853 else if (CNV == 65535)
22854 MaskBits = 16;
22855 }
22856
22857 if (!MaskBits)
22858 return SDValue();
22859
22860 SDValue AddValue = AndNode->getOperand(0);
22861
22862 if (AddValue.getOpcode() != ISD::ADD)
22863 return SDValue();
22864
22865 // The basic dag structure is correct, grab the inputs and validate them.
22866
22867 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
22868 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
22869 SDValue SubsInputValue = SubsNode->getOperand(1);
22870
22871 // The mask is present and the provenance of all the values is a smaller
22872 // type, so let's see if the mask is superfluous.
22873
22874 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
22875 !isa<ConstantSDNode>(SubsInputValue.getNode()))
22876 return SDValue();
22877
22878 ISD::LoadExtType ExtType;
22879
22880 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
22881 !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
22882 !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
22883 return SDValue();
22884
22885 if (!isEquivalentMaskless(CC, MaskBits, ExtType,
22886       cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
22887       cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
22888 return SDValue();
22889
22890 // The AND is not necessary, remove it.
22891
22892 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
22893 SubsNode->getValueType(1));
22894 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
22895
22896 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
22897 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
22898
22899 return SDValue(N, 0);
22900}
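// Illustrative example, not from the original source: with a zero-extended
// 8-bit load a, C code such as
//
//   if ((uint8_t)(a - 1) == 7) ...
//
// becomes (SUBS (AND (ADD a, -1), 0xff), 7) with an EQ condition. Since
// isEquivalentMaskless(EQ, 8, ZEXTLOAD, -1, 7) holds, the AND is dropped and
// the SUBS compares (ADD a, -1) against 7 directly.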
22901
22902// Optimize compare with zero and branch.
22903static SDValue performBRCONDCombine(SDNode *N,
22904 TargetLowering::DAGCombinerInfo &DCI,
22905 SelectionDAG &DAG) {
22906 MachineFunction &MF = DAG.getMachineFunction();
22907 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
22908 // will not be produced, as they are conditional branch instructions that do
22909 // not set flags.
22910 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
22911 return SDValue();
22912
22913 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
22914 N = NV.getNode();
22915 SDValue Chain = N->getOperand(0);
22916 SDValue Dest = N->getOperand(1);
22917 SDValue CCVal = N->getOperand(2);
22918 SDValue Cmp = N->getOperand(3);
22919
22920 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
22921 unsigned CC = CCVal->getAsZExtVal();
22922 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
22923 return SDValue();
22924
22925 unsigned CmpOpc = Cmp.getOpcode();
22926 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
22927 return SDValue();
22928
22929 // Only attempt folding if there is only one use of the flag and no use of the
22930 // value.
22931 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
22932 return SDValue();
22933
22934 SDValue LHS = Cmp.getOperand(0);
22935 SDValue RHS = Cmp.getOperand(1);
22936
22937 assert(LHS.getValueType() == RHS.getValueType() &&
22938 "Expected the value type to be the same for both operands!");
22939 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
22940 return SDValue();
22941
22942 if (isNullConstant(LHS))
22943 std::swap(LHS, RHS);
22944
22945 if (!isNullConstant(RHS))
22946 return SDValue();
22947
22948 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
22949 LHS.getOpcode() == ISD::SRL)
22950 return SDValue();
22951
22952 // Fold the compare into the branch instruction.
22953 SDValue BR;
22954 if (CC == AArch64CC::EQ)
22955 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
22956 else
22957 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
22958
22959 // Do not add new nodes to DAG combiner worklist.
22960 DCI.CombineTo(N, BR, false);
22961
22962 return SDValue();
22963}
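// Illustrative example, not from the original source:
//
//   cmp w0, #0          ; AArch64ISD::SUBS w0, #0
//   b.eq .Ltarget       ; branch on EQ
//     ==> cbz w0, .Ltarget
//
// and the NE form becomes cbnz. The flag result must have exactly one use and
// the value result none, so no other user still depends on the NZCV flags.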
22964
22965static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
22966 unsigned CC = N->getConstantOperandVal(2);
22967 SDValue SUBS = N->getOperand(3);
22968 SDValue Zero, CTTZ;
22969
22970 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
22971 Zero = N->getOperand(0);
22972 CTTZ = N->getOperand(1);
22973 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
22974 Zero = N->getOperand(1);
22975 CTTZ = N->getOperand(0);
22976 } else
22977 return SDValue();
22978
22979 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
22980 (CTTZ.getOpcode() == ISD::TRUNCATE &&
22981 CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
22982 return SDValue();
22983
22984 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
22985 "Illegal type in CTTZ folding");
22986
22987 if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
22988 return SDValue();
22989
22990 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
22991 ? CTTZ.getOperand(0).getOperand(0)
22992 : CTTZ.getOperand(0);
22993
22994 if (X != SUBS.getOperand(0))
22995 return SDValue();
22996
22997 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
22998 ? CTTZ.getOperand(0).getValueSizeInBits()
22999 : CTTZ.getValueSizeInBits();
23000 SDValue BitWidthMinusOne =
23001 DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
23002 return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
23003 BitWidthMinusOne);
23004}
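// Illustrative example, not from the original source: AArch64 lowers CTTZ as
// RBIT + CLZ, and CLZ of zero yields the bit width, so for i32
//
//   x == 0 ? 0 : cttz(x)
//     ==> cttz(x) & 31
//
// because cttz(0) evaluates to 32 here and 32 & 31 == 0, which makes the CSEL
// around the CTTZ redundant.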
23005
23006// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
23007// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
23008// Where x and y are constants and x != y
23009
23010// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
23011// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
23012// Where x and y are constants and x != y
23013static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
23014 SDValue L = Op->getOperand(0);
23015 SDValue R = Op->getOperand(1);
23016 AArch64CC::CondCode OpCC =
23017 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
23018
23019 SDValue OpCmp = Op->getOperand(3);
23020 if (!isCMP(OpCmp))
23021 return SDValue();
23022
23023 SDValue CmpLHS = OpCmp.getOperand(0);
23024 SDValue CmpRHS = OpCmp.getOperand(1);
23025
23026 if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
23027 std::swap(CmpLHS, CmpRHS);
23028 else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
23029 return SDValue();
23030
23031 SDValue X = CmpLHS->getOperand(0);
23032 SDValue Y = CmpLHS->getOperand(1);
23033 if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y) || X == Y) {
23034 return SDValue();
23035 }
23036
23037 // If one of the constants is an opaque constant, the X and Y SDNodes can
23038 // still be different even though the real values are the same, so compare
23039 // the APInt values here to make sure the code is correct.
23040 ConstantSDNode *CX = cast<ConstantSDNode>(X);
23041 ConstantSDNode *CY = cast<ConstantSDNode>(Y);
23042 if (CX->getAPIntValue() == CY->getAPIntValue())
23043 return SDValue();
23044
23046 static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
23047 SDValue Cond = CmpLHS->getOperand(3);
23048
23049 if (CmpRHS == Y)
23050   CC = AArch64CC::getInvertedCondCode(CC);
23051 else if (CmpRHS != X)
23052 return SDValue();
23053
23054 if (OpCC == AArch64CC::NE)
23055   CC = AArch64CC::getInvertedCondCode(CC);
23056 else if (OpCC != AArch64CC::EQ)
23057 return SDValue();
23058
23059 SDLoc DL(Op);
23060 EVT VT = Op->getValueType(0);
23061
23062 SDValue CCValue = DAG.getConstant(CC, DL, MVT::i32);
23063 return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
23064}
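// Illustrative example, not from the original source, of the first pattern
// above with x = 1 and y = 0 (node numbers are made up):
//
//   t1: i32 = CSEL 1, 0, cc2, cond
//   t2: nzcv = CMP t1, 1
//   t3: i32 = CSEL l, r, EQ, t2
//     ==> t3: i32 = CSEL l, r, cc2, cond
//
// Comparing the inner CSEL against one of its two distinct constants merely
// re-tests cc2 (or its inverse), so the outer CSEL can use cond directly.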
23065
23066// Optimize CSEL instructions
23067static SDValue performCSELCombine(SDNode *N,
23068 TargetLowering::DAGCombinerInfo &DCI,
23069 SelectionDAG &DAG) {
23070 // CSEL x, x, cc -> x
23071 if (N->getOperand(0) == N->getOperand(1))
23072 return N->getOperand(0);
23073
23074 if (SDValue R = foldCSELOfCSEL(N, DAG))
23075 return R;
23076
23077 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
23078 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
23079 if (SDValue Folded = foldCSELofCTTZ(N, DAG))
23080 return Folded;
23081
23082 return performCONDCombine(N, DCI, DAG, 2, 3);
23083}
23084
23085// Try to re-use an already extended operand of a vector SetCC feeding an
23086// extended select. Doing so avoids requiring another full extension of the
23087// SET_CC result when lowering the select.
23088static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
23089 EVT Op0MVT = Op->getOperand(0).getValueType();
23090 if (!Op0MVT.isVector() || Op->use_empty())
23091 return SDValue();
23092
23093 // Make sure that all uses of Op are VSELECTs with result matching types where
23094 // the result type has a larger element type than the SetCC operand.
23095 SDNode *FirstUse = *Op->use_begin();
23096 if (FirstUse->getOpcode() != ISD::VSELECT)
23097 return SDValue();
23098 EVT UseMVT = FirstUse->getValueType(0);
23099 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
23100 return SDValue();
23101 if (any_of(Op->uses(), [&UseMVT](const SDNode *N) {
23102 return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
23103 }))
23104 return SDValue();
23105
23106 APInt V;
23107 if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
23108 return SDValue();
23109
23110 SDLoc DL(Op);
23111 SDValue Op0ExtV;
23112 SDValue Op1ExtV;
23113 ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
23114 // Check if the first operand of the SET_CC is already extended. If it is,
23115 // split the SET_CC and re-use the extended version of the operand.
23116 SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
23117 Op->getOperand(0));
23118 SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
23119 Op->getOperand(0));
23120 if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
23121 Op0ExtV = SDValue(Op0SExt, 0);
23122 Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
23123 } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
23124 Op0ExtV = SDValue(Op0ZExt, 0);
23125 Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
23126 } else
23127 return SDValue();
23128
23129 return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
23130 Op0ExtV, Op1ExtV, Op->getOperand(2));
23131}
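// Illustrative example, not from the original source: if the DAG already
// contains (sign_extend v4i32 a) and every use of the compare is a v4i32
// vselect, then
//
//   vselect (setcc v4i16 a, splat(c), setlt), x, y     ; x, y : v4i32
//     ==> vselect (setcc v4i32 (sign_extend a), (sign_extend splat(c)),
//                  setlt), x, y
//
// which re-uses the existing extension of a instead of extending the v4i16
// compare result while lowering the select.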
23132
23133static SDValue
23135 SelectionDAG &DAG) {
23136 SDValue Vec = N->getOperand(0);
23137 if (DCI.isBeforeLegalize() &&
23138 Vec.getValueType().getVectorElementType() == MVT::i1 &&
23141 SDLoc DL(N);
23142 return getVectorBitwiseReduce(N->getOpcode(), Vec, N->getValueType(0), DL,
23143 DAG);
23144 }
23145
23146 return SDValue();
23147}
23148
23151 SelectionDAG &DAG) {
23152 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
23153 SDValue LHS = N->getOperand(0);
23154 SDValue RHS = N->getOperand(1);
23155 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
23156 SDLoc DL(N);
23157 EVT VT = N->getValueType(0);
23158
23159 if (SDValue V = tryToWidenSetCCOperands(N, DAG))
23160 return V;
23161
23162 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
23163 if (Cond == ISD::SETNE && isOneConstant(RHS) &&
23164 LHS->getOpcode() == AArch64ISD::CSEL &&
23165 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
23166 LHS->hasOneUse()) {
23167 // Invert CSEL's condition.
23168 auto OldCond =
23169 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
23170 auto NewCond = getInvertedCondCode(OldCond);
23171
23172 // csel 0, 1, !cond, X
23173 SDValue CSEL =
23174 DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
23175 LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
23176 LHS.getOperand(3));
23177 return DAG.getZExtOrTrunc(CSEL, DL, VT);
23178 }
23179
23180 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
23181 if (Cond == ISD::SETNE && isNullConstant(RHS) &&
23182 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
23183 LHS->getConstantOperandVal(1) < VT.getScalarSizeInBits() &&
23184 LHS->hasOneUse()) {
23185 EVT TstVT = LHS->getValueType(0);
23186 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) {
23187 // this pattern will get better opt in emitComparison
23188 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
23189 SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
23190 DAG.getConstant(TstImm, DL, TstVT));
23191 return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
23192 }
23193 }
23194
23195 // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
23196 // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
23197 // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne)
23198 // ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne)
23199 if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
23200 (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
23202 LHS->getOpcode() == ISD::BITCAST) {
23203 EVT ToVT = LHS->getValueType(0);
23204 EVT FromVT = LHS->getOperand(0).getValueType();
23205 if (FromVT.isFixedLengthVector() &&
23206 FromVT.getVectorElementType() == MVT::i1) {
23207 bool IsNull = isNullConstant(RHS);
23209 DL, MVT::i1, LHS->getOperand(0));
23210 LHS = DAG.getNode(IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, ToVT,
23211 LHS);
23212 return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
23213 }
23214 }
23215
23216 // Try to perform the memcmp when the result is tested for [in]equality with 0
23217 if (SDValue V = performOrXorChainCombine(N, DAG))
23218 return V;
23219
23220 return SDValue();
23221}
23222
23223// Replace a flag-setting operator (eg ANDS) with the generic version
23224// (eg AND) if the flag is unused.
23227 unsigned GenericOpcode) {
23228 SDLoc DL(N);
23229 SDValue LHS = N->getOperand(0);
23230 SDValue RHS = N->getOperand(1);
23231 EVT VT = N->getValueType(0);
23232
23233 // If the flag result isn't used, convert back to a generic opcode.
23234 if (!N->hasAnyUseOfValue(1)) {
23235 SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
23236 return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)},
23237 DL);
23238 }
23239
23240 // Combine identical generic nodes into this node, re-using the result.
23241 if (SDNode *Generic = DCI.DAG.getNodeIfExists(
23242 GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
23243 DCI.CombineTo(Generic, SDValue(N, 0));
23244
23245 return SDValue();
23246}
23247
23248static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
23249 // setcc_merge_zero pred
23250 // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
23251 // => extract_subvector (inner setcc_merge_zero)
23252 SDValue Pred = N->getOperand(0);
23253 SDValue LHS = N->getOperand(1);
23254 SDValue RHS = N->getOperand(2);
23255 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
23256
23257 if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
23258 LHS->getOpcode() != ISD::SIGN_EXTEND)
23259 return SDValue();
23260
23261 SDValue Extract = LHS->getOperand(0);
23262 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
23263 Extract->getValueType(0) != N->getValueType(0) ||
23264 Extract->getConstantOperandVal(1) != 0)
23265 return SDValue();
23266
23267 SDValue InnerSetCC = Extract->getOperand(0);
23268 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
23269 return SDValue();
23270
23271 // By this point we've effectively got
23272 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
23273 // lanes are already zero then the trunc(sext()) sequence is redundant and we
23274 // can operate on A directly.
23275 SDValue InnerPred = InnerSetCC.getOperand(0);
23276 if (Pred.getOpcode() == AArch64ISD::PTRUE &&
23277 InnerPred.getOpcode() == AArch64ISD::PTRUE &&
23278 Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
23279 Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
23280 Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
23281 return Extract;
23282
23283 return SDValue();
23284}
23285
23286static SDValue
23288 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
23289 "Unexpected opcode!");
23290
23291 SelectionDAG &DAG = DCI.DAG;
23292 SDValue Pred = N->getOperand(0);
23293 SDValue LHS = N->getOperand(1);
23294 SDValue RHS = N->getOperand(2);
23295 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
23296
23297 if (SDValue V = performSetCCPunpkCombine(N, DAG))
23298 return V;
23299
23300 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
23301 LHS->getOpcode() == ISD::SIGN_EXTEND &&
23302 LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
23303 // setcc_merge_zero(
23304 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
23305 // => setcc_merge_zero(pred, ...)
23306 if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
23307 LHS->getOperand(0)->getOperand(0) == Pred)
23308 return LHS->getOperand(0);
23309
23310 // setcc_merge_zero(
23311 // all_active, extend(nxvNi1 ...), != splat(0))
23312 // -> nxvNi1 ...
23313 if (isAllActivePredicate(DAG, Pred))
23314 return LHS->getOperand(0);
23315
23316 // setcc_merge_zero(
23317 // pred, extend(nxvNi1 ...), != splat(0))
23318 // -> nxvNi1 and(pred, ...)
23319 if (DCI.isAfterLegalizeDAG())
23320 // Do this after legalization to allow more folds on setcc_merge_zero
23321 // to be recognized.
23322 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
23323 LHS->getOperand(0), Pred);
23324 }
23325
23326 return SDValue();
23327}
23328
23329// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
23330// as well as whether the test should be inverted. This code is required to
23331// catch these cases (as opposed to standard dag combines) because
23332// AArch64ISD::TBZ is matched during legalization.
23333static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
23334 SelectionDAG &DAG) {
23335
23336 if (!Op->hasOneUse())
23337 return Op;
23338
23339 // We don't handle undef/constant-fold cases below, as they should have
23340 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
23341 // etc.)
23342
23343 // (tbz (trunc x), b) -> (tbz x, b)
23344 // This case is just here to enable more of the below cases to be caught.
23345 if (Op->getOpcode() == ISD::TRUNCATE &&
23346 Bit < Op->getValueType(0).getSizeInBits()) {
23347 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23348 }
23349
23350 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
23351 if (Op->getOpcode() == ISD::ANY_EXTEND &&
23352 Bit < Op->getOperand(0).getValueSizeInBits()) {
23353 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23354 }
23355
23356 if (Op->getNumOperands() != 2)
23357 return Op;
23358
23359 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
23360 if (!C)
23361 return Op;
23362
23363 switch (Op->getOpcode()) {
23364 default:
23365 return Op;
23366
23367 // (tbz (and x, m), b) -> (tbz x, b)
23368 case ISD::AND:
23369 if ((C->getZExtValue() >> Bit) & 1)
23370 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23371 return Op;
23372
23373 // (tbz (shl x, c), b) -> (tbz x, b-c)
23374 case ISD::SHL:
23375 if (C->getZExtValue() <= Bit &&
23376 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
23377 Bit = Bit - C->getZExtValue();
23378 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23379 }
23380 return Op;
23381
23382 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
23383 case ISD::SRA:
23384 Bit = Bit + C->getZExtValue();
23385 if (Bit >= Op->getValueType(0).getSizeInBits())
23386 Bit = Op->getValueType(0).getSizeInBits() - 1;
23387 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23388
23389 // (tbz (srl x, c), b) -> (tbz x, b+c)
23390 case ISD::SRL:
23391 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
23392 Bit = Bit + C->getZExtValue();
23393 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23394 }
23395 return Op;
23396
23397 // (tbz (xor x, -1), b) -> (tbnz x, b)
23398 case ISD::XOR:
23399 if ((C->getZExtValue() >> Bit) & 1)
23400 Invert = !Invert;
23401 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23402 }
23403}
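// Illustrative example, not from the original source, chaining two of the
// rules above:
//
//   tbz (xor (srl x, 3), -1), #0, .Ltarget
//     -> tbnz (srl x, 3), #0, .Ltarget   ; xor with -1 inverts the test
//     -> tbnz x, #3, .Ltarget            ; srl moves the tested bit up by 3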
23404
23405// Optimize test single bit zero/non-zero and branch.
23408 SelectionDAG &DAG) {
23409 unsigned Bit = N->getConstantOperandVal(2);
23410 bool Invert = false;
23411 SDValue TestSrc = N->getOperand(1);
23412 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
23413
23414 if (TestSrc == NewTestSrc)
23415 return SDValue();
23416
23417 unsigned NewOpc = N->getOpcode();
23418 if (Invert) {
23419 if (NewOpc == AArch64ISD::TBZ)
23420 NewOpc = AArch64ISD::TBNZ;
23421 else {
23422 assert(NewOpc == AArch64ISD::TBNZ);
23423 NewOpc = AArch64ISD::TBZ;
23424 }
23425 }
23426
23427 SDLoc DL(N);
23428 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
23429 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
23430}
23431
23432// Swap vselect operands where it may allow a predicated operation to achieve
23433// the `sel`.
23434//
23435// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
23436// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
23437static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
23438 auto SelectA = N->getOperand(1);
23439 auto SelectB = N->getOperand(2);
23440 auto NTy = N->getValueType(0);
23441
23442 if (!NTy.isScalableVector())
23443 return SDValue();
23444 SDValue SetCC = N->getOperand(0);
23445 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
23446 return SDValue();
23447
23448 switch (SelectB.getOpcode()) {
23449 default:
23450 return SDValue();
23451 case ISD::FMUL:
23452 case ISD::FSUB:
23453 case ISD::FADD:
23454 break;
23455 }
23456 if (SelectA != SelectB.getOperand(0))
23457 return SDValue();
23458
23459 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
23460 ISD::CondCode InverseCC =
23462 auto InverseSetCC =
23463 DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
23464 SetCC.getOperand(1), InverseCC);
23465
23466 return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
23467 {InverseSetCC, SelectB, SelectA});
23468}
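// Illustrative example, not from the original source, for a scalable vector:
//
//   vselect (setcc a, b, setolt), a, (fadd a, c)
//     ==> vselect (setcc a, b, setuge), (fadd a, c), a
//
// With the arithmetic in the "true" slot, the whole expression can be selected
// as a single predicated FADD merging into a, instead of an unpredicated FADD
// followed by a separate sel.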
23469
23470// vselect (v1i1 setcc) ->
23471// vselect (v1iXX setcc) (XX is the size of the compared operand type)
23472// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
23473// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
23474// such VSELECT.
23476 if (auto SwapResult = trySwapVSelectOperands(N, DAG))
23477 return SwapResult;
23478
23479 SDValue N0 = N->getOperand(0);
23480 EVT CCVT = N0.getValueType();
23481
23482 if (isAllActivePredicate(DAG, N0))
23483 return N->getOperand(1);
23484
23485 if (isAllInactivePredicate(N0))
23486 return N->getOperand(2);
23487
23488 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
23489 // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
23490 // supported types.
23491 SDValue SetCC = N->getOperand(0);
23492 if (SetCC.getOpcode() == ISD::SETCC &&
23493 SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
23494 SDValue CmpLHS = SetCC.getOperand(0);
23495 EVT VT = CmpLHS.getValueType();
23496 SDNode *CmpRHS = SetCC.getOperand(1).getNode();
23497 SDNode *SplatLHS = N->getOperand(1).getNode();
23498 SDNode *SplatRHS = N->getOperand(2).getNode();
23499 APInt SplatLHSVal;
23500 if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
23501 VT.isSimple() &&
23502 is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
23503 MVT::v2i32, MVT::v4i32, MVT::v2i64}),
23504 VT.getSimpleVT().SimpleTy) &&
23505 ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
23506 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
23508 unsigned NumElts = VT.getVectorNumElements();
23510 NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
23511 VT.getScalarType()));
23512 SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
23513
23514 auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
23515 auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
23516 return Or;
23517 }
23518 }
23519
23520 EVT CmpVT = N0.getOperand(0).getValueType();
23521 if (N0.getOpcode() != ISD::SETCC ||
23523 CCVT.getVectorElementType() != MVT::i1 ||
23525 return SDValue();
23526
23527 EVT ResVT = N->getValueType(0);
23528 // Only combine when the result type is of the same size as the compared
23529 // operands.
23530 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
23531 return SDValue();
23532
23533 SDValue IfTrue = N->getOperand(1);
23534 SDValue IfFalse = N->getOperand(2);
23535 SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
23536 N0.getOperand(0), N0.getOperand(1),
23537 cast<CondCodeSDNode>(N0.getOperand(2))->get());
23538 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
23539 IfTrue, IfFalse);
23540}
23541
23542/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
23543/// the compare-mask instructions rather than going via NZCV, even if LHS and
23544/// RHS are really scalar. This replaces any scalar setcc in the above pattern
23545/// with a vector one followed by a DUP shuffle on the result.
23548 SelectionDAG &DAG = DCI.DAG;
23549 SDValue N0 = N->getOperand(0);
23550 EVT ResVT = N->getValueType(0);
23551
23552 if (N0.getOpcode() != ISD::SETCC)
23553 return SDValue();
23554
23555 if (ResVT.isScalableVT())
23556 return SDValue();
23557
23558 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
23559 // scalar SetCCResultType. We also don't expect vectors, because we assume
23560 // that selects fed by vector SETCCs are canonicalized to VSELECT.
23561 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
23562 "Scalar-SETCC feeding SELECT has unexpected result type!");
23563
23564 // If NumMaskElts == 0, the comparison is larger than select result. The
23565 // largest real NEON comparison is 64-bits per lane, which means the result is
23566 // at most 32-bits and an illegal vector. Just bail out for now.
23567 EVT SrcVT = N0.getOperand(0).getValueType();
23568
23569 // Don't try to do this optimization when the setcc itself has i1 operands.
23570 // There are no legal vectors of i1, so this would be pointless. v1f16 is
23571 // ruled out to prevent the creation of setcc that need to be scalarized.
23572 if (SrcVT == MVT::i1 ||
23573 (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
23574 return SDValue();
23575
23576 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
23577 if (!ResVT.isVector() || NumMaskElts == 0)
23578 return SDValue();
23579
23580 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
23582
23583 // Also bail out if the vector CCVT isn't the same size as ResVT.
23584 // This can happen if the SETCC operand size doesn't divide the ResVT size
23585 // (e.g., f64 vs v3f32).
23586 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
23587 return SDValue();
23588
23589 // Make sure we didn't create illegal types, if we're not supposed to.
23590 assert(DCI.isBeforeLegalize() ||
23591 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
23592
23593 // First perform a vector comparison, where lane 0 is the one we're interested
23594 // in.
23595 SDLoc DL(N0);
23596 SDValue LHS =
23597 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
23598 SDValue RHS =
23599 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
23600 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
23601
23602 // Now duplicate the comparison mask we want across all other lanes.
23603 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
23604 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
23605 Mask = DAG.getNode(ISD::BITCAST, DL,
23606 ResVT.changeVectorElementTypeToInteger(), Mask);
23607
23608 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
23609}
23610
23613 EVT VT = N->getValueType(0);
23614 SDLoc DL(N);
23615 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
23616 // 128bit vector version.
23617 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
23619 SmallVector<SDValue> Ops(N->ops());
23620 if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
23621 DCI.DAG.getVTList(LVT), Ops)) {
23622 return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
23623 DCI.DAG.getConstant(0, DL, MVT::i64));
23624 }
23625 }
23626
23627 if (N->getOpcode() == AArch64ISD::DUP) {
23628 if (DCI.isAfterLegalizeDAG()) {
23629 // If scalar dup's operand is extract_vector_elt, try to combine them into
23630 // duplane. For example,
23631 //
23632 // t21: i32 = extract_vector_elt t19, Constant:i64<0>
23633 // t18: v4i32 = AArch64ISD::DUP t21
23634 // ==>
23635 // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
23636 SDValue EXTRACT_VEC_ELT = N->getOperand(0);
23637 if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23638 if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) {
23639 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
23640 return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0),
23641 EXTRACT_VEC_ELT.getOperand(1));
23642 }
23643 }
23644 }
23645
23646 return performPostLD1Combine(N, DCI, false);
23647 }
23648
23649 return SDValue();
23650}
23651
23652/// Get rid of unnecessary NVCASTs (that don't change the type).
23654 if (N->getValueType(0) == N->getOperand(0).getValueType())
23655 return N->getOperand(0);
23656 if (N->getOperand(0).getOpcode() == AArch64ISD::NVCAST)
23657 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(N), N->getValueType(0),
23658 N->getOperand(0).getOperand(0));
23659
23660 return SDValue();
23661}
23662
23663// If all users of the globaladdr are of the form (globaladdr + constant), find
23664// the smallest constant, fold it into the globaladdr's offset and rewrite the
23665// globaladdr as (globaladdr + constant) - constant.
23667 const AArch64Subtarget *Subtarget,
23668 const TargetMachine &TM) {
23669 auto *GN = cast<GlobalAddressSDNode>(N);
23670 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
23672 return SDValue();
23673
23674 uint64_t MinOffset = -1ull;
23675 for (SDNode *N : GN->uses()) {
23676 if (N->getOpcode() != ISD::ADD)
23677 return SDValue();
23678 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
23679 if (!C)
23680 C = dyn_cast<ConstantSDNode>(N->getOperand(1));
23681 if (!C)
23682 return SDValue();
23683 MinOffset = std::min(MinOffset, C->getZExtValue());
23684 }
23685 uint64_t Offset = MinOffset + GN->getOffset();
23686
23687 // Require that the new offset is larger than the existing one. Otherwise, we
23688 // can end up oscillating between two possible DAGs, for example,
23689 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
23690 if (Offset <= uint64_t(GN->getOffset()))
23691 return SDValue();
23692
23693 // Check whether folding this offset is legal. It must not go out of bounds of
23694 // the referenced object to avoid violating the code model, and must be
23695 // smaller than 2^20 because this is the largest offset expressible in all
23696 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
23697 // stores an immediate signed 21 bit offset.)
23698 //
23699 // This check also prevents us from folding negative offsets, which will end
23700 // up being treated in the same way as large positive ones. They could also
23701 // cause code model violations, and aren't really common enough to matter.
23702 if (Offset >= (1 << 20))
23703 return SDValue();
23704
23705 const GlobalValue *GV = GN->getGlobal();
23706 Type *T = GV->getValueType();
23707 if (!T->isSized() ||
23709 return SDValue();
23710
23711 SDLoc DL(GN);
23712 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
23713 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
23714 DAG.getConstant(MinOffset, DL, MVT::i64));
23715}
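// Illustrative example, not from the original source: if the only uses of a
// global g are (add g, 8) and (add g, 12), then MinOffset is 8 and g is
// rewritten as
//
//   (sub (globaladdr g, offset 8), 8)
//
// The +8 use then folds to the offset global itself and the +12 use becomes an
// add of 4, while the materialised address g+8 stays inside the object.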
23716
23718 const AArch64Subtarget *Subtarget) {
23719 SDValue BR = N->getOperand(0);
23720 if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
23721 !BR.getValueType().isScalarInteger())
23722 return SDValue();
23723
23724 SDLoc DL(N);
23725 return DAG.getNode(ISD::CTTZ, DL, BR.getValueType(), BR.getOperand(0));
23726}
23727
23728// Turns the vector of indices into a vector of byte offsets by scaling Offset
23729// by (BitWidth / 8).
23731 SDLoc DL, unsigned BitWidth) {
23732 assert(Offset.getValueType().isScalableVector() &&
23733 "This method is only for scalable vectors of offsets");
23734
23735 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
23736 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
23737
23738 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
23739}
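// Illustrative example, not from the original source: for 32-bit elements
// (BitWidth == 32) an index vector <0, 1, 2, 3> becomes the byte-offset vector
// <0, 4, 8, 12>, i.e. a splatted shift-left by Log2_32(32 / 8) == 2.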
23740
23741/// Check if the value of \p OffsetInBytes can be used as an immediate for
23742/// the gather load/prefetch and scatter store instructions with vector base and
23743/// immediate offset addressing mode:
23744///
23745/// [<Zn>.[S|D]{, #<imm>}]
23746///
23747/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
23748inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
23749 unsigned ScalarSizeInBytes) {
23750 // The immediate is not a multiple of the scalar size.
23751 if (OffsetInBytes % ScalarSizeInBytes)
23752 return false;
23753
23754 // The immediate is out of range.
23755 if (OffsetInBytes / ScalarSizeInBytes > 31)
23756 return false;
23757
23758 return true;
23759}
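// Illustrative example, not from the original source: for 32-bit elements
// (ScalarSizeInBytes == 4) the valid immediates are 0, 4, 8, ..., 124. An
// offset of 6 fails the multiple-of-size check and 128 fails the range check,
// so both would have to use a register-offset addressing mode instead.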
23760
23761/// Check if the value of \p Offset represents a valid immediate for the SVE
23762/// gather load/prefetch and scatter store instructions with vector base and
23763/// immediate offset addressing mode:
23764///
23765/// [<Zn>.[S|D]{, #<imm>}]
23766///
23767/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
23768static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
23769 unsigned ScalarSizeInBytes) {
23770 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
23771 return OffsetConst && isValidImmForSVEVecImmAddrMode(
23772 OffsetConst->getZExtValue(), ScalarSizeInBytes);
23773}
23774
23776 unsigned Opcode,
23777 bool OnlyPackedOffsets = true) {
23778 const SDValue Src = N->getOperand(2);
23779 const EVT SrcVT = Src->getValueType(0);
23780 assert(SrcVT.isScalableVector() &&
23781 "Scatter stores are only possible for SVE vectors");
23782
23783 SDLoc DL(N);
23784 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
23785
23786 // Make sure that source data will fit into an SVE register
23788 return SDValue();
23789
23790 // For FPs, ACLE only supports _packed_ single and double precision types.
23791 // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
23792 if (SrcElVT.isFloatingPoint())
23793 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
23794 ((Opcode != AArch64ISD::SST1Q_PRED &&
23795 Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
23796 ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
23797 return SDValue();
23798
23799 // Depending on the addressing mode, this is either a pointer or a vector of
23800 // pointers (that fits into one register)
23801 SDValue Base = N->getOperand(4);
23802 // Depending on the addressing mode, this is either a single offset or a
23803 // vector of offsets (that fits into one register)
23804 SDValue Offset = N->getOperand(5);
23805
23806 // For "scalar + vector of indices", just scale the indices. This only
23807 // applies to non-temporal scatters because there's no instruction that takes
23808 // indices.
23809 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
23810 Offset =
23812 Opcode = AArch64ISD::SSTNT1_PRED;
23813 } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
23814 Offset =
23816 Opcode = AArch64ISD::SST1Q_PRED;
23817 }
23818
23819 // In the case of non-temporal scatter stores there's only one SVE
23820 // instruction per data-size: "scalar + vector", i.e.
23821 // * stnt1{b|h|w|d} { z0.s }, p0, [z0.s, x0]
23822 // Since we do have intrinsics that allow the arguments to be in a different
23823 // order, we may need to swap them to match the spec.
23824 if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
23825 Offset.getValueType().isVector())
23827
23828 // SST1_IMM requires that the offset is an immediate that is:
23829 // * a multiple of #SizeInBytes,
23830 // * in the range [0, 31 x #SizeInBytes],
23831 // where #SizeInBytes is the size in bytes of the stored items. For
23832 // immediates outside that range and non-immediate scalar offsets use SST1 or
23833 // SST1_UXTW instead.
23834 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
23836 SrcVT.getScalarSizeInBits() / 8)) {
23837 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
23839 else
23840 Opcode = AArch64ISD::SST1_PRED;
23841
23843 }
23844 }
23845
23846 auto &TLI = DAG.getTargetLoweringInfo();
23847 if (!TLI.isTypeLegal(Base.getValueType()))
23848 return SDValue();
23849
23850 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
23851 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extended to
23852 // nxv2i64. Legalize accordingly.
23853 if (!OnlyPackedOffsets &&
23854 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
23855 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
23856
23857 if (!TLI.isTypeLegal(Offset.getValueType()))
23858 return SDValue();
23859
23860 // Source value type that is representable in hardware
23861 EVT HwSrcVt = getSVEContainerType(SrcVT);
23862
23863 // Keep the original type of the input data to store - this is needed to be
23864 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
23865 // FP values we want the integer equivalent, so just use HwSrcVt.
23866 SDValue InputVT = DAG.getValueType(SrcVT);
23867 if (SrcVT.isFloatingPoint())
23868 InputVT = DAG.getValueType(HwSrcVt);
23869
23870 SDVTList VTs = DAG.getVTList(MVT::Other);
23871 SDValue SrcNew;
23872
23873 if (Src.getValueType().isFloatingPoint())
23874 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
23875 else
23876 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
23877
23878 SDValue Ops[] = {N->getOperand(0), // Chain
23879 SrcNew,
23880 N->getOperand(3), // Pg
23881 Base,
23882 Offset,
23883 InputVT};
23884
23885 return DAG.getNode(Opcode, DL, VTs, Ops);
23886}
23887
23889 unsigned Opcode,
23890 bool OnlyPackedOffsets = true) {
23891 const EVT RetVT = N->getValueType(0);
23892 assert(RetVT.isScalableVector() &&
23893 "Gather loads are only possible for SVE vectors");
23894
23895 SDLoc DL(N);
23896
23897 // Make sure that the loaded data will fit into an SVE register
23899 return SDValue();
23900
23901 // Depending on the addressing mode, this is either a pointer or a vector of
23902 // pointers (that fits into one register)
23903 SDValue Base = N->getOperand(3);
23904 // Depending on the addressing mode, this is either a single offset or a
23905 // vector of offsets (that fits into one register)
23906 SDValue Offset = N->getOperand(4);
23907
23908 // For "scalar + vector of indices", scale the indices to obtain unscaled
23909 // offsets. This applies to non-temporal and quadword gathers, which do not
23910 // have an addressing mode with scaled offset.
23913 RetVT.getScalarSizeInBits());
23915 } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
23917 RetVT.getScalarSizeInBits());
23919 }
23920
23921 // In the case of non-temporal gather loads and quadword gather loads there's
23922 // only one addressing mode: "vector + scalar", e.g.
23923 // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
23924 // Since we do have intrinsics that allow the arguments to be in a different
23925 // order, we may need to swap them to match the spec.
23926 if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
23927 Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
23928 Offset.getValueType().isVector())
23930
23931 // GLD{FF}1_IMM requires that the offset is an immediate that is:
23932 // * a multiple of #SizeInBytes,
23933 // * in the range [0, 31 x #SizeInBytes],
23934 // where #SizeInBytes is the size in bytes of the loaded items. For
23935 // immediates outside that range and non-immediate scalar offsets use
23936 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
23937 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
23940 RetVT.getScalarSizeInBits() / 8)) {
23941 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
23942 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
23945 else
23946 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
23949
23951 }
23952 }
23953
23954 auto &TLI = DAG.getTargetLoweringInfo();
23955 if (!TLI.isTypeLegal(Base.getValueType()))
23956 return SDValue();
23957
23958 // Some gather load variants allow unpacked offsets, but only as nxv2i32
23959 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extended to
23960 // nxv2i64. Legalize accordingly.
23961 if (!OnlyPackedOffsets &&
23962 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
23963 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
23964
23965 // Return value type that is representable in hardware
23966 EVT HwRetVt = getSVEContainerType(RetVT);
23967
23968 // Keep the original output value type around - this is needed to be able to
23969 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
23970 // values we want the integer equivalent, so just use HwRetVT.
23971 SDValue OutVT = DAG.getValueType(RetVT);
23972 if (RetVT.isFloatingPoint())
23973 OutVT = DAG.getValueType(HwRetVt);
23974
23975 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
23976 SDValue Ops[] = {N->getOperand(0), // Chain
23977 N->getOperand(2), // Pg
23978 Base, Offset, OutVT};
23979
23980 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
23981 SDValue LoadChain = SDValue(Load.getNode(), 1);
23982
23983 if (RetVT.isInteger() && (RetVT != HwRetVt))
23984 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
23985
23986 // If the original return value was FP, bitcast accordingly. Doing it here
23987 // means that we can avoid adding TableGen patterns for FPs.
23988 if (RetVT.isFloatingPoint())
23989 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
23990
23991 return DAG.getMergeValues({Load, LoadChain}, DL);
23992}
23993
23994static SDValue
23996 SelectionDAG &DAG) {
23997 SDLoc DL(N);
23998 SDValue Src = N->getOperand(0);
23999 unsigned Opc = Src->getOpcode();
24000
24001 // Sign extend of an unsigned unpack -> signed unpack
24002 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
24003
24004 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
24006
24007 // Push the sign extend to the operand of the unpack
24008 // This is necessary where, for example, the operand of the unpack
24009 // is another unpack:
24010 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
24011 // ->
24012 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
24013 // ->
24014 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
24015 SDValue ExtOp = Src->getOperand(0);
24016 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
24017 EVT EltTy = VT.getVectorElementType();
24018 (void)EltTy;
24019
24020 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
24021 "Sign extending from an invalid type");
24022
24023 EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
24024
24026 ExtOp, DAG.getValueType(ExtVT));
24027
24028 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
24029 }
24030
24031 if (DCI.isBeforeLegalizeOps())
24032 return SDValue();
24033
24035 return SDValue();
24036
24037 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
24038 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
24039 unsigned NewOpc;
24040 unsigned MemVTOpNum = 4;
24041 switch (Opc) {
24044 MemVTOpNum = 3;
24045 break;
24048 MemVTOpNum = 3;
24049 break;
24052 MemVTOpNum = 3;
24053 break;
24056 break;
24059 break;
24062 break;
24065 break;
24068 break;
24071 break;
24074 break;
24077 break;
24080 break;
24083 break;
24086 break;
24089 break;
24092 break;
24095 break;
24098 break;
24099 default:
24100 return SDValue();
24101 }
24102
24103 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
24104 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
24105
24106 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
24107 return SDValue();
24108
24109 EVT DstVT = N->getValueType(0);
24110 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
24111
24113 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
24114 Ops.push_back(Src->getOperand(I));
24115
24116 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
24117 DCI.CombineTo(N, ExtLoad);
24118 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
24119
24120 // Return N so it doesn't get rechecked
24121 return SDValue(N, 0);
24122}
24123
24124/// Legalize the gather prefetch (scalar + vector addressing mode) when the
24125/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
24126/// != nxv2i32) do not need legalization.
24128 const unsigned OffsetPos = 4;
24129 SDValue Offset = N->getOperand(OffsetPos);
24130
24131 // Not an unpacked vector, bail out.
24132 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
24133 return SDValue();
24134
24135 // Extend the unpacked offset vector to 64-bit lanes.
24136 SDLoc DL(N);
24137 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
24138 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
24139 // Replace the offset operand with the 64-bit one.
24140 Ops[OffsetPos] = Offset;
24141
24142 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
24143}
24144
24145/// Combines a node carrying the intrinsic
24146/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
24147/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
24148/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
24149/// sve gather prefetch instruction with vector plus immediate addressing mode.
24151 unsigned ScalarSizeInBytes) {
24152 const unsigned ImmPos = 4, OffsetPos = 3;
24153 // No need to combine the node if the immediate is valid...
24154 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
24155 return SDValue();
24156
24157 // ...otherwise swap the offset base with the offset...
24158 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
24159 std::swap(Ops[ImmPos], Ops[OffsetPos]);
24160 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
24161 // `aarch64_sve_prfb_gather_uxtw_index`.
24162 SDLoc DL(N);
24163 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
24164 MVT::i64);
24165
24166 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
24167}
24168
24169// Return true if the vector operation can guarantee that only the first lane
24170// of its result contains data, with all bits in other lanes set to zero.
24171static bool isLanes1toNKnownZero(SDValue Op) {
24172 switch (Op.getOpcode()) {
24173 default:
24174 return false;
24190 return true;
24191 }
24192}
24193
24195 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
24196 SDValue InsertVec = N->getOperand(0);
24197 SDValue InsertElt = N->getOperand(1);
24198 SDValue InsertIdx = N->getOperand(2);
24199
24200 // We only care about inserts into the first element...
24201 if (!isNullConstant(InsertIdx))
24202 return SDValue();
24203 // ...of a zero'd vector...
24205 return SDValue();
24206 // ...where the inserted data was previously extracted...
24207 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
24208 return SDValue();
24209
24210 SDValue ExtractVec = InsertElt.getOperand(0);
24211 SDValue ExtractIdx = InsertElt.getOperand(1);
24212
24213 // ...from the first element of a vector.
24214 if (!isNullConstant(ExtractIdx))
24215 return SDValue();
24216
24217 // If we get here we are effectively trying to zero lanes 1-N of a vector.
24218
24219 // Ensure there's no type conversion going on.
24220 if (N->getValueType(0) != ExtractVec.getValueType())
24221 return SDValue();
24222
24223 if (!isLanes1toNKnownZero(ExtractVec))
24224 return SDValue();
24225
24226 // The explicit zeroing is redundant.
24227 return ExtractVec;
24228}
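// Illustrative example, not from the original source:
//
//   insert_vector_elt (splat 0), (extract_vector_elt V, 0), 0
//
// is simply V when V is one of the nodes isLanes1toNKnownZero accepts (e.g. an
// SVE reduction that leaves its scalar result in lane 0 and zeros elsewhere),
// so the explicit re-zeroing of lanes 1..N is dropped.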
24229
24230static SDValue
24233 return Res;
24234
24235 return performPostLD1Combine(N, DCI, true);
24236}
24237
24239 EVT Ty = N->getValueType(0);
24240 if (Ty.isInteger())
24241 return SDValue();
24242
24245 if (ExtIntTy.getVectorElementType().getScalarSizeInBits() <
24247 return SDValue();
24248
24249 SDLoc DL(N);
24250 SDValue LHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(0)),
24251 DL, ExtIntTy);
24252 SDValue RHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(1)),
24253 DL, ExtIntTy);
24254 SDValue Idx = N->getOperand(2);
24255 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ExtIntTy, LHS, RHS, Idx);
24256 SDValue Trunc = DAG.getAnyExtOrTrunc(Splice, DL, IntTy);
24257 return DAG.getBitcast(Ty, Trunc);
24258}
24259
24262 const AArch64Subtarget *Subtarget) {
24263 SDValue N0 = N->getOperand(0);
24264 EVT VT = N->getValueType(0);
24265
24266 // If this is fp_round(fpextend), don't fold it; allow ourselves to be folded.
24267 if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND)
24268 return SDValue();
24269
24270 auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
24271 EVT EltVT = VT.getVectorElementType();
24272 return EltVT == MVT::f32 || EltVT == MVT::f64;
24273 };
24274
24275 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
24276 // We purposefully don't care about legality of the nodes here as we know
24277 // they can be split down into something legal.
24278 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
24279 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
24280 VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
24281 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
24282 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
24283 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
24284 LN0->getChain(), LN0->getBasePtr(),
24285 N0.getValueType(), LN0->getMemOperand());
24286 DCI.CombineTo(N, ExtLoad);
24287 DCI.CombineTo(
24288 N0.getNode(),
24289 DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
24290 DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
24291 ExtLoad.getValue(1));
24292 return SDValue(N, 0); // Return N so it doesn't get rechecked!
24293 }
24294
24295 return SDValue();
24296}
24297
24299 const AArch64Subtarget *Subtarget) {
24300 EVT VT = N->getValueType(0);
24301
24302 // Don't expand for NEON, SVE2 or SME
24303 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
24304 return SDValue();
24305
24306 SDLoc DL(N);
24307
24308 SDValue Mask = N->getOperand(0);
24309 SDValue In1 = N->getOperand(1);
24310 SDValue In2 = N->getOperand(2);
24311
24312 SDValue InvMask = DAG.getNOT(DL, Mask, VT);
24313 SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
24314 SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
24315 return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
24316}
24317
24319 EVT VT = N->getValueType(0);
24320
24321 SDValue Insert = N->getOperand(0);
24322 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
24323 return SDValue();
24324
24325 if (!Insert.getOperand(0).isUndef())
24326 return SDValue();
24327
24328 uint64_t IdxInsert = Insert.getConstantOperandVal(2);
24329 uint64_t IdxDupLane = N->getConstantOperandVal(1);
24330 if (IdxInsert != 0 || IdxDupLane != 0)
24331 return SDValue();
24332
24333 SDValue Bitcast = Insert.getOperand(1);
24334 if (Bitcast.getOpcode() != ISD::BITCAST)
24335 return SDValue();
24336
24337 SDValue Subvec = Bitcast.getOperand(0);
24338 EVT SubvecVT = Subvec.getValueType();
24339 if (!SubvecVT.is128BitVector())
24340 return SDValue();
24341 EVT NewSubvecVT =
24343
24344 SDLoc DL(N);
24345 SDValue NewInsert =
24346 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
24347 DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
24348 SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
24349 NewInsert, N->getOperand(1));
24350 return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
24351}
24352
24353// Try to combine mull with uzp1.
24356 SelectionDAG &DAG) {
24357 if (DCI.isBeforeLegalizeOps())
24358 return SDValue();
24359
24360 SDValue LHS = N->getOperand(0);
24361 SDValue RHS = N->getOperand(1);
24362
24363 SDValue ExtractHigh;
24364 SDValue ExtractLow;
24365 SDValue TruncHigh;
24366 SDValue TruncLow;
24367 SDLoc DL(N);
24368
24369 // Check the operands are trunc and extract_high.
24371 RHS.getOpcode() == ISD::TRUNCATE) {
24372 TruncHigh = RHS;
24373 if (LHS.getOpcode() == ISD::BITCAST)
24374 ExtractHigh = LHS.getOperand(0);
24375 else
24376 ExtractHigh = LHS;
24378 LHS.getOpcode() == ISD::TRUNCATE) {
24379 TruncHigh = LHS;
24380 if (LHS.getOpcode() == ISD::BITCAST)
24381 ExtractHigh = RHS.getOperand(0);
24382 else
24383 ExtractHigh = RHS;
24384 } else
24385 return SDValue();
24386
24387 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
24388 // with uzp1.
24389 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
24390 SDValue TruncHighOp = TruncHigh.getOperand(0);
24391 EVT TruncHighOpVT = TruncHighOp.getValueType();
24392 if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
24393 DAG.isSplatValue(TruncHighOp, false))
24394 return SDValue();
24395
24396 // Check there is other extract_high with same source vector.
24397 // For example,
24398 //
24399 // t18: v4i16 = extract_subvector t2, Constant:i64<0>
24400 // t12: v4i16 = truncate t11
24401 // t31: v4i32 = AArch64ISD::SMULL t18, t12
24402 // t23: v4i16 = extract_subvector t2, Constant:i64<4>
24403 // t16: v4i16 = truncate t15
24404 // t30: v4i32 = AArch64ISD::SMULL t23, t16
24405 //
24406 // This DAG combine assumes the two extract_high nodes use the same source
24407 // vector in order to detect the pair of MULLs. If they use different source
24408 // vectors, this code will not work.
24409 bool HasFoundMULLow = true;
24410 SDValue ExtractHighSrcVec = ExtractHigh.getOperand(0);
24411 if (ExtractHighSrcVec->use_size() != 2)
24412 HasFoundMULLow = false;
24413
24414 // Find ExtractLow.
24415 for (SDNode *User : ExtractHighSrcVec.getNode()->uses()) {
24416 if (User == ExtractHigh.getNode())
24417 continue;
24418
24419 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
24420 !isNullConstant(User->getOperand(1))) {
24421 HasFoundMULLow = false;
24422 break;
24423 }
24424
24425 ExtractLow.setNode(User);
24426 }
24427
24428 if (!ExtractLow || !ExtractLow->hasOneUse())
24429 HasFoundMULLow = false;
24430
24431 // Check ExtractLow's user.
24432 if (HasFoundMULLow) {
24433 SDNode *ExtractLowUser = *ExtractLow.getNode()->use_begin();
24434 if (ExtractLowUser->getOpcode() != N->getOpcode()) {
24435 HasFoundMULLow = false;
24436 } else {
24437 if (ExtractLowUser->getOperand(0) == ExtractLow) {
24438 if (ExtractLowUser->getOperand(1).getOpcode() == ISD::TRUNCATE)
24439 TruncLow = ExtractLowUser->getOperand(1);
24440 else
24441 HasFoundMULLow = false;
24442 } else {
24443 if (ExtractLowUser->getOperand(0).getOpcode() == ISD::TRUNCATE)
24444 TruncLow = ExtractLowUser->getOperand(0);
24445 else
24446 HasFoundMULLow = false;
24447 }
24448 }
24449 }
24450
24451 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
24452 // with uzp1.
24453 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
24454 EVT TruncHighVT = TruncHigh.getValueType();
24455 EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(*DAG.getContext());
24456 SDValue TruncLowOp =
24457 HasFoundMULLow ? TruncLow.getOperand(0) : DAG.getUNDEF(UZP1VT);
24458 EVT TruncLowOpVT = TruncLowOp.getValueType();
24459 if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
24460 DAG.isSplatValue(TruncLowOp, false)))
24461 return SDValue();
24462
24463 // Create uzp1, extract_high and extract_low.
24464 if (TruncHighOpVT != UZP1VT)
24465 TruncHighOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncHighOp);
24466 if (TruncLowOpVT != UZP1VT)
24467 TruncLowOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncLowOp);
24468
24469 SDValue UZP1 =
24470 DAG.getNode(AArch64ISD::UZP1, DL, UZP1VT, TruncLowOp, TruncHighOp);
24471 SDValue HighIdxCst =
24472 DAG.getConstant(TruncHighVT.getVectorNumElements(), DL, MVT::i64);
24473 SDValue NewTruncHigh =
24474 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncHighVT, UZP1, HighIdxCst);
24475 DAG.ReplaceAllUsesWith(TruncHigh, NewTruncHigh);
24476
24477 if (HasFoundMULLow) {
24478 EVT TruncLowVT = TruncLow.getValueType();
24479 SDValue NewTruncLow = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncLowVT,
24480 UZP1, ExtractLow.getOperand(1));
24481 DAG.ReplaceAllUsesWith(TruncLow, NewTruncLow);
24482 }
24483
24484 return SDValue(N, 0);
24485}
24486
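// Combine entry point for AArch64ISD::SMULL/UMULL/PMULL nodes: the individual
// MULL combines are tried in turn and the first successful rewrite wins,
// finishing with the UZP1 rewrite implemented above.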
24487static SDValue performMULLCombine(SDNode *N,
24488 TargetLowering::DAGCombinerInfo &DCI,
24489 SelectionDAG &DAG) {
24490 if (SDValue Val =
24491 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG))
24492 return Val;
24493
24494 if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
24495 return Val;
24496
24497 return SDValue();
24498}
24499
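// Fold a zero-extended i32 extract of the first UADDLV result lane feeding a
// v1i64 scalar_to_vector: the same value can be produced by extracting a
// v2i32 subvector and reinterpreting it with AArch64ISD::NVCAST (see the DAG
// sketch inside the function).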
24500static SDValue
24501performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
24502 SelectionDAG &DAG) {
24503 // Let's do below transform.
24504 //
24505 // t34: v4i32 = AArch64ISD::UADDLV t2
24506 // t35: i32 = extract_vector_elt t34, Constant:i64<0>
24507 // t7: i64 = zero_extend t35
24508 // t20: v1i64 = scalar_to_vector t7
24509 // ==>
24510 // t34: v4i32 = AArch64ISD::UADDLV t2
24511 // t39: v2i32 = extract_subvector t34, Constant:i64<0>
24512 // t40: v1i64 = AArch64ISD::NVCAST t39
24513 if (DCI.isBeforeLegalizeOps())
24514 return SDValue();
24515
24516 EVT VT = N->getValueType(0);
24517 if (VT != MVT::v1i64)
24518 return SDValue();
24519
24520 SDValue ZEXT = N->getOperand(0);
24521 if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
24522 return SDValue();
24523
24524 SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(0);
24525 if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
24526 EXTRACT_VEC_ELT.getValueType() != MVT::i32)
24527 return SDValue();
24528
24529 if (!isNullConstant(EXTRACT_VEC_ELT.getOperand(1)))
24530 return SDValue();
24531
24532 SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(0);
24533 if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
24534 UADDLV.getValueType() != MVT::v4i32 ||
24535 UADDLV.getOperand(0).getValueType() != MVT::v8i8)
24536 return SDValue();
24537
24538 // Let's generate new sequence with AArch64ISD::NVCAST.
24539 SDLoc DL(N);
24540 SDValue EXTRACT_SUBVEC =
24541 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
24542 DAG.getConstant(0, DL, MVT::i64));
24543 SDValue NVCAST =
24544 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, EXTRACT_SUBVEC);
24545
24546 return NVCAST;
24547}
24548
24549SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
24550 DAGCombinerInfo &DCI) const {
24551 SelectionDAG &DAG = DCI.DAG;
24552 switch (N->getOpcode()) {
24553 default:
24554 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
24555 break;
24556 case ISD::VECREDUCE_AND:
24557 case ISD::VECREDUCE_OR:
24558 case ISD::VECREDUCE_XOR:
24559 return performVecReduceBitwiseCombine(N, DCI, DAG);
24560 case ISD::ADD:
24561 case ISD::SUB:
24562 return performAddSubCombine(N, DCI);
24563 case ISD::BUILD_VECTOR:
24564 return performBuildVectorCombine(N, DCI, DAG);
24565 case ISD::TRUNCATE:
24566 return performTruncateCombine(N, DAG);
24567 case AArch64ISD::ANDS:
24568 return performFlagSettingCombine(N, DCI, ISD::AND);
24569 case AArch64ISD::ADC:
24570 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
24571 return R;
24572 return foldADCToCINC(N, DAG);
24573 case AArch64ISD::SBC:
24574 return foldOverflowCheck(N, DAG, /* IsAdd */ false);
24575 case AArch64ISD::ADCS:
24576 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
24577 return R;
24578 return performFlagSettingCombine(N, DCI, AArch64ISD::ADC);
24579 case AArch64ISD::SBCS:
24580 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
24581 return R;
24582 return performFlagSettingCombine(N, DCI, AArch64ISD::SBC);
24583 case ISD::XOR:
24584 return performXorCombine(N, DAG, DCI, Subtarget);
24585 case ISD::MUL:
24586 return performMulCombine(N, DAG, DCI, Subtarget);
24587 case ISD::SINT_TO_FP:
24588 case ISD::UINT_TO_FP:
24589 return performIntToFpCombine(N, DAG, Subtarget);
24590 case ISD::FP_TO_SINT:
24591 case ISD::FP_TO_UINT:
24592 case ISD::FP_TO_SINT_SAT:
24593 case ISD::FP_TO_UINT_SAT:
24594 return performFpToIntCombine(N, DAG, DCI, Subtarget);
24595 case ISD::FDIV:
24596 return performFDivCombine(N, DAG, DCI, Subtarget);
24597 case ISD::OR:
24598 return performORCombine(N, DCI, Subtarget, *this);
24599 case ISD::AND:
24600 return performANDCombine(N, DCI);
24601 case ISD::FADD:
24602 return performFADDCombine(N, DCI);
24603 case ISD::INTRINSIC_WO_CHAIN:
24604 return performIntrinsicCombine(N, DCI, Subtarget);
24605 case ISD::ANY_EXTEND:
24606 case ISD::ZERO_EXTEND:
24607 case ISD::SIGN_EXTEND:
24608 return performExtendCombine(N, DCI, DAG);
24609 case ISD::SIGN_EXTEND_INREG:
24610 return performSignExtendInRegCombine(N, DCI, DAG);
24611 case ISD::CONCAT_VECTORS:
24612 return performConcatVectorsCombine(N, DCI, DAG);
24613 case ISD::EXTRACT_SUBVECTOR:
24614 return performExtractSubvectorCombine(N, DCI, DAG);
24615 case ISD::INSERT_SUBVECTOR:
24616 return performInsertSubvectorCombine(N, DCI, DAG);
24617 case ISD::SELECT:
24618 return performSelectCombine(N, DCI);
24619 case ISD::VSELECT:
24620 return performVSelectCombine(N, DCI.DAG);
24621 case ISD::SETCC:
24622 return performSETCCCombine(N, DCI, DAG);
24623 case ISD::LOAD:
24624 return performLOADCombine(N, DCI, DAG, Subtarget);
24625 case ISD::STORE:
24626 return performSTORECombine(N, DCI, DAG, Subtarget);
24627 case ISD::MSTORE:
24628 return performMSTORECombine(N, DCI, DAG, Subtarget);
24629 case ISD::MGATHER:
24630 case ISD::MSCATTER:
24631 return performMaskedGatherScatterCombine(N, DCI, DAG);
24632 case ISD::VECTOR_SPLICE:
24633 return performSVESpliceCombine(N, DAG);
24634 case ISD::FP_EXTEND:
24635 return performFPExtendCombine(N, DAG, DCI, Subtarget);
24636 case AArch64ISD::BRCOND:
24637 return performBRCONDCombine(N, DCI, DAG);
24638 case AArch64ISD::TBNZ:
24639 case AArch64ISD::TBZ:
24640 return performTBZCombine(N, DCI, DAG);
24641 case AArch64ISD::CSEL:
24642 return performCSELCombine(N, DCI, DAG);
24643 case AArch64ISD::DUP:
24644 case AArch64ISD::DUPLANE8:
24645 case AArch64ISD::DUPLANE16:
24646 case AArch64ISD::DUPLANE32:
24647 case AArch64ISD::DUPLANE64:
24648 return performDUPCombine(N, DCI);
24649 case AArch64ISD::DUPLANE128:
24650 return performDupLane128Combine(N, DAG);
24651 case AArch64ISD::NVCAST:
24652 return performNVCASTCombine(N, DAG);
24653 case AArch64ISD::SPLICE:
24654 return performSpliceCombine(N, DAG);
24655 case AArch64ISD::UUNPKLO:
24656 case AArch64ISD::UUNPKHI:
24657 return performUnpackCombine(N, DAG, Subtarget);
24658 case AArch64ISD::UZP1:
24659 return performUzpCombine(N, DAG, Subtarget);
24660 case AArch64ISD::SETCC_MERGE_ZERO:
24661 return performSetccMergeZeroCombine(N, DCI);
24678 return performGLD1Combine(N, DAG);
24679 case AArch64ISD::VASHR:
24680 case AArch64ISD::VLSHR:
24681 return performVectorShiftCombine(N, *this, DCI);
24682 case AArch64ISD::SUNPKLO:
24683 return performSunpkloCombine(N, DAG);
24684 case AArch64ISD::BSP:
24685 return performBSPExpandForSVE(N, DAG, Subtarget);
24686 case ISD::INSERT_VECTOR_ELT:
24687 return performInsertVectorEltCombine(N, DCI);
24688 case ISD::EXTRACT_VECTOR_ELT:
24689 return performExtractVectorEltCombine(N, DCI, Subtarget);
24690 case ISD::VECREDUCE_ADD:
24691 return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
24692 case AArch64ISD::UADDV:
24693 return performUADDVCombine(N, DAG);
24694 case AArch64ISD::SMULL:
24695 case AArch64ISD::UMULL:
24696 case AArch64ISD::PMULL:
24697 return performMULLCombine(N, DCI, DAG);
24698 case ISD::INTRINSIC_VOID:
24699 case ISD::INTRINSIC_W_CHAIN:
24700 switch (N->getConstantOperandVal(1)) {
24701 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
24702 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
24703 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
24704 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
24705 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
24706 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
24707 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
24708 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
24709 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
24710 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
24711 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
24712 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
24713 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
24714 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
24715 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
24716 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
24718 case Intrinsic::aarch64_neon_ld2:
24719 case Intrinsic::aarch64_neon_ld3:
24720 case Intrinsic::aarch64_neon_ld4:
24721 case Intrinsic::aarch64_neon_ld1x2:
24722 case Intrinsic::aarch64_neon_ld1x3:
24723 case Intrinsic::aarch64_neon_ld1x4:
24724 case Intrinsic::aarch64_neon_ld2lane:
24725 case Intrinsic::aarch64_neon_ld3lane:
24726 case Intrinsic::aarch64_neon_ld4lane:
24727 case Intrinsic::aarch64_neon_ld2r:
24728 case Intrinsic::aarch64_neon_ld3r:
24729 case Intrinsic::aarch64_neon_ld4r:
24730 case Intrinsic::aarch64_neon_st2:
24731 case Intrinsic::aarch64_neon_st3:
24732 case Intrinsic::aarch64_neon_st4:
24733 case Intrinsic::aarch64_neon_st1x2:
24734 case Intrinsic::aarch64_neon_st1x3:
24735 case Intrinsic::aarch64_neon_st1x4:
24736 case Intrinsic::aarch64_neon_st2lane:
24737 case Intrinsic::aarch64_neon_st3lane:
24738 case Intrinsic::aarch64_neon_st4lane:
24739 return performNEONPostLDSTCombine(N, DCI, DAG);
24740 case Intrinsic::aarch64_sve_ldnt1:
24741 return performLDNT1Combine(N, DAG);
24742 case Intrinsic::aarch64_sve_ld1rq:
24743 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
24744 case Intrinsic::aarch64_sve_ld1ro:
24745 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
24746 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
24748 case Intrinsic::aarch64_sve_ldnt1_gather:
24750 case Intrinsic::aarch64_sve_ldnt1_gather_index:
24751 return performGatherLoadCombine(N, DAG,
24753 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
24755 case Intrinsic::aarch64_sve_ld1:
24757 case Intrinsic::aarch64_sve_ldnf1:
24759 case Intrinsic::aarch64_sve_ldff1:
24761 case Intrinsic::aarch64_sve_st1:
24762 return performST1Combine(N, DAG);
24763 case Intrinsic::aarch64_sve_stnt1:
24764 return performSTNT1Combine(N, DAG);
24765 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
24767 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
24769 case Intrinsic::aarch64_sve_stnt1_scatter:
24771 case Intrinsic::aarch64_sve_stnt1_scatter_index:
24773 case Intrinsic::aarch64_sve_ld1_gather:
24775 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
24776 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
24778 case Intrinsic::aarch64_sve_ld1q_gather_index:
24779 return performGatherLoadCombine(N, DAG,
24781 case Intrinsic::aarch64_sve_ld1_gather_index:
24782 return performGatherLoadCombine(N, DAG,
24784 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
24786 /*OnlyPackedOffsets=*/false);
24787 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
24789 /*OnlyPackedOffsets=*/false);
24790 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
24791 return performGatherLoadCombine(N, DAG,
24793 /*OnlyPackedOffsets=*/false);
24794 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
24795 return performGatherLoadCombine(N, DAG,
24797 /*OnlyPackedOffsets=*/false);
24798 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
24800 case Intrinsic::aarch64_sve_ldff1_gather:
24802 case Intrinsic::aarch64_sve_ldff1_gather_index:
24803 return performGatherLoadCombine(N, DAG,
24805 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
24806 return performGatherLoadCombine(N, DAG,
24808 /*OnlyPackedOffsets=*/false);
24809 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
24810 return performGatherLoadCombine(N, DAG,
24812 /*OnlyPackedOffsets=*/false);
24813 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
24814 return performGatherLoadCombine(N, DAG,
24816 /*OnlyPackedOffsets=*/false);
24817 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
24818 return performGatherLoadCombine(N, DAG,
24820 /*OnlyPackedOffsets=*/false);
24821 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
24822 return performGatherLoadCombine(N, DAG,
24824 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
24825 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
24827 case Intrinsic::aarch64_sve_st1q_scatter_index:
24829 case Intrinsic::aarch64_sve_st1_scatter:
24831 case Intrinsic::aarch64_sve_st1_scatter_index:
24833 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
24835 /*OnlyPackedOffsets=*/false);
24836 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
24838 /*OnlyPackedOffsets=*/false);
24839 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
24840 return performScatterStoreCombine(N, DAG,
24842 /*OnlyPackedOffsets=*/false);
24843 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
24844 return performScatterStoreCombine(N, DAG,
24846 /*OnlyPackedOffsets=*/false);
24847 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
24849 case Intrinsic::aarch64_rndr:
24850 case Intrinsic::aarch64_rndrrs: {
24851 unsigned IntrinsicID = N->getConstantOperandVal(1);
24852 auto Register =
24853 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
24854 : AArch64SysReg::RNDRRS);
24855 SDLoc DL(N);
24856 SDValue A = DAG.getNode(
24857 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other),
24858 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i64));
24859 SDValue B = DAG.getNode(
24860 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
24861 DAG.getConstant(0, DL, MVT::i32),
24862 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
24863 return DAG.getMergeValues(
24864 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
24865 }
24866 case Intrinsic::aarch64_sme_ldr_zt:
24867 return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N),
24868 DAG.getVTList(MVT::Other), N->getOperand(0),
24869 N->getOperand(2), N->getOperand(3));
24870 case Intrinsic::aarch64_sme_str_zt:
24871 return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N),
24872 DAG.getVTList(MVT::Other), N->getOperand(0),
24873 N->getOperand(2), N->getOperand(3));
24874 default:
24875 break;
24876 }
24877 break;
24878 case ISD::GlobalAddress:
24879 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
24880 case ISD::CTLZ:
24881 return performCTLZCombine(N, DAG, Subtarget);
24882 case ISD::SCALAR_TO_VECTOR:
24883 return performScalarToVectorCombine(N, DCI, DAG);
24884 }
24885 return SDValue();
24886}
24887
24888// Check if the return value is used as only a return value, as otherwise
24889// we can't perform a tail-call. In particular, we need to check for
24890// target ISD nodes that are returns and any other "odd" constructs
24891// that the generic analysis code won't necessarily catch.
24892bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
24893 SDValue &Chain) const {
24894 if (N->getNumValues() != 1)
24895 return false;
24896 if (!N->hasNUsesOfValue(1, 0))
24897 return false;
24898
24899 SDValue TCChain = Chain;
24900 SDNode *Copy = *N->use_begin();
24901 if (Copy->getOpcode() == ISD::CopyToReg) {
24902 // If the copy has a glue operand, we conservatively assume it isn't safe to
24903 // perform a tail call.
24904 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
24905 MVT::Glue)
24906 return false;
24907 TCChain = Copy->getOperand(0);
24908 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
24909 return false;
24910
24911 bool HasRet = false;
24912 for (SDNode *Node : Copy->uses()) {
24913 if (Node->getOpcode() != AArch64ISD::RET_GLUE)
24914 return false;
24915 HasRet = true;
24916 }
24917
24918 if (!HasRet)
24919 return false;
24920
24921 Chain = TCChain;
24922 return true;
24923}
24924
24925// Return whether an instruction can potentially be optimized to a tail
24926// call. This will cause the optimizers to attempt to move, or duplicate,
24927// return instructions to help enable tail call optimizations for this
24928// instruction.
24929bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
24930 return CI->isTailCall();
24931}
24932
24933bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
24934 Register Offset, bool IsPre,
24935 MachineRegisterInfo &MRI) const {
24936 auto CstOffset = getIConstantVRegVal(Offset, MRI);
24937 if (!CstOffset || CstOffset->isZero())
24938 return false;
24939
24940 // All of the indexed addressing mode instructions take a signed 9 bit
24941 // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
24942 // encodes the sign/indexing direction.
24943 return isInt<9>(CstOffset->getSExtValue());
24944}
24945
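// Shared helper for pre/post-indexed addressing: accept an ADD/SUB of the
// pointer by a signed 9-bit constant, but bail out when the loaded value is
// only used by a scalable-vector splat, where a replicating load (ld1r*) is
// the better choice.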
24946bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
24947 SDValue &Base,
24948 SDValue &Offset,
24949 SelectionDAG &DAG) const {
24950 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
24951 return false;
24952
24953 // Non-null if there is exactly one user of the loaded value (ignoring chain).
24954 SDNode *ValOnlyUser = nullptr;
24955 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
24956 ++UI) {
24957 if (UI.getUse().getResNo() == 1)
24958 continue; // Ignore chain.
24959 if (ValOnlyUser == nullptr)
24960 ValOnlyUser = *UI;
24961 else {
24962 ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
24963 break;
24964 }
24965 }
24966
24967 auto IsUndefOrZero = [](SDValue V) {
24968 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
24969 };
24970
24971 // If the only user of the value is a scalable vector splat, it is
24972 // preferable to do a replicating load (ld1r*).
24973 if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
24974 (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
24975 (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
24976 IsUndefOrZero(ValOnlyUser->getOperand(2)))))
24977 return false;
24978
24979 Base = Op->getOperand(0);
24980 // All of the indexed addressing mode instructions take a signed
24981 // 9 bit immediate offset.
24982 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
24983 int64_t RHSC = RHS->getSExtValue();
24984 if (Op->getOpcode() == ISD::SUB)
24985 RHSC = -(uint64_t)RHSC;
24986 if (!isInt<9>(RHSC))
24987 return false;
24988 // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
24989 // when dealing with subtraction.
24990 Offset = DAG.getConstant(RHSC, SDLoc(N), RHS->getValueType(0));
24991 return true;
24992 }
24993 return false;
24994}
24995
24996bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
24997 SDValue &Offset,
24998 ISD::MemIndexedMode &AM,
24999 SelectionDAG &DAG) const {
25000 EVT VT;
25001 SDValue Ptr;
25002 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
25003 VT = LD->getMemoryVT();
25004 Ptr = LD->getBasePtr();
25005 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
25006 VT = ST->getMemoryVT();
25007 Ptr = ST->getBasePtr();
25008 } else
25009 return false;
25010
25011 if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, DAG))
25012 return false;
25013 AM = ISD::PRE_INC;
25014 return true;
25015}
25016
25017bool AArch64TargetLowering::getPostIndexedAddressParts(
25018 SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
25019 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
25020 EVT VT;
25021 SDValue Ptr;
25022 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
25023 VT = LD->getMemoryVT();
25024 Ptr = LD->getBasePtr();
25025 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
25026 VT = ST->getMemoryVT();
25027 Ptr = ST->getBasePtr();
25028 } else
25029 return false;
25030
25031 if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
25032 return false;
25033 // Post-indexing updates the base, so it's not a valid transform
25034 // if that's not the same as the load's pointer.
25035 if (Ptr != Base)
25036 return false;
25037 AM = ISD::POST_INC;
25038 return true;
25039}
25040
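// Legalise a bitcast from an i1 vector to a scalar integer by materialising
// the predicate as a bitmask (vectorToScalarBitmask) and zero-extending or
// truncating it to the requested width.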
25041static void replaceBoolVectorBitcast(SDNode *N,
25042 SmallVectorImpl<SDValue> &Results,
25043 SelectionDAG &DAG) {
25044 SDLoc DL(N);
25045 SDValue Op = N->getOperand(0);
25046 EVT VT = N->getValueType(0);
25047 [[maybe_unused]] EVT SrcVT = Op.getValueType();
25048 assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
25049 "Must be bool vector.");
25050
25051 // Special handling for Clang's __builtin_convertvector. For vectors with <8
25052 // elements, it adds a vector concatenation with undef(s). If we encounter
25053 // this here, we can skip the concat.
25054 if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(0).isUndef()) {
25055 bool AllUndef = true;
25056 for (unsigned I = 1; I < Op.getNumOperands(); ++I)
25057 AllUndef &= Op.getOperand(I).isUndef();
25058
25059 if (AllUndef)
25060 Op = Op.getOperand(0);
25061 }
25062
25063 SDValue VectorBits = vectorToScalarBitmask(Op.getNode(), DAG);
25064 if (VectorBits)
25065 Results.push_back(DAG.getZExtOrTrunc(VectorBits, DL, VT));
25066}
25067
25068static void CustomNonLegalBITCASTResults(SDNode *N,
25069 SmallVectorImpl<SDValue> &Results,
25070 SelectionDAG &DAG, EVT ExtendVT,
25071 EVT CastVT) {
25072 SDLoc DL(N);
25073 SDValue Op = N->getOperand(0);
25074 EVT VT = N->getValueType(0);
25075
25076 // Use SCALAR_TO_VECTOR for lane zero
25077 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtendVT, Op);
25078 SDValue CastVal = DAG.getNode(ISD::BITCAST, DL, CastVT, Vec);
25079 SDValue IdxZero = DAG.getVectorIdxConstant(0, DL);
25080 Results.push_back(
25081 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CastVal, IdxZero));
25082}
25083
25084void AArch64TargetLowering::ReplaceBITCASTResults(
25085 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
25086 SDLoc DL(N);
25087 SDValue Op = N->getOperand(0);
25088 EVT VT = N->getValueType(0);
25089 EVT SrcVT = Op.getValueType();
25090
25091 if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
25092 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
25093 return;
25094 }
25095
25096 if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
25097 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
25098 return;
25099 }
25100
25101 if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
25102 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
25103 return;
25104 }
25105
25106 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
25107 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
25108 "Expected fp->int bitcast!");
25109
25110 // Bitcasting between unpacked vector types of different element counts is
25111 // not a NOP because the live elements are laid out differently.
25112 // 01234567
25113 // e.g. nxv2i32 = XX??XX??
25114 // nxv4f16 = X?X?X?X?
25115 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
25116 return;
25117
25118 SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
25119 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
25120 return;
25121 }
25122
25123 if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
25124 !VT.isVector())
25125 return replaceBoolVectorBitcast(N, Results, DAG);
25126
25127 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
25128 return;
25129
25130 Op = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
25131 DAG.getUNDEF(MVT::i32), Op);
25132 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
25133 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
25134}
25135
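// Recognise a 256-bit (F)ADD of a value with a <1,0,3,2,...> shuffle of
// itself and replace it with an ADDP on the two 128-bit halves, shuffling
// the pairwise results back into the original lane order.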
25136static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
25137 SelectionDAG &DAG,
25138 const AArch64Subtarget *Subtarget) {
25139 EVT VT = N->getValueType(0);
25140 if (!VT.is256BitVector() ||
25141 (VT.getScalarType().isFloatingPoint() &&
25142 !N->getFlags().hasAllowReassociation()) ||
25143 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
25144 VT.getScalarType() == MVT::bf16)
25145 return;
25146
25147 SDValue X = N->getOperand(0);
25148 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
25149 if (!Shuf) {
25150 Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
25151 X = N->getOperand(1);
25152 if (!Shuf)
25153 return;
25154 }
25155
25156 if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
25157 return;
25158
25159 // Check the mask is 1,0,3,2,5,4,...
25160 ArrayRef<int> Mask = Shuf->getMask();
25161 for (int I = 0, E = Mask.size(); I < E; I++)
25162 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
25163 return;
25164
25165 SDLoc DL(N);
25166 auto LoHi = DAG.SplitVector(X, DL);
25167 assert(LoHi.first.getValueType() == LoHi.second.getValueType());
25168 SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
25169 LoHi.first, LoHi.second);
25170
25171 // Shuffle the elements back into order.
25172 SmallVector<int> NMask;
25173 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
25174 NMask.push_back(I);
25175 NMask.push_back(I);
25176 }
25177 Results.push_back(
25178 DAG.getVectorShuffle(VT, DL,
25179 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
25180 DAG.getUNDEF(LoHi.first.getValueType())),
25181 DAG.getUNDEF(VT), NMask));
25182}
25183
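// Split an illegal reduction input in half, combine the halves with InterOp
// and finish with the across-vector reduction AcrossOp.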
25184static void ReplaceReductionResults(SDNode *N,
25185 SmallVectorImpl<SDValue> &Results,
25186 SelectionDAG &DAG, unsigned InterOp,
25187 unsigned AcrossOp) {
25188 EVT LoVT, HiVT;
25189 SDValue Lo, Hi;
25190 SDLoc dl(N);
25191 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
25192 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
25193 SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
25194 SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
25195 Results.push_back(SplitVal);
25196}
25197
25198void AArch64TargetLowering::ReplaceExtractSubVectorResults(
25199 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
25200 SDValue In = N->getOperand(0);
25201 EVT InVT = In.getValueType();
25202
25203 // Common code will handle these just fine.
25204 if (!InVT.isScalableVector() || !InVT.isInteger())
25205 return;
25206
25207 SDLoc DL(N);
25208 EVT VT = N->getValueType(0);
25209
25210 // The following checks bail if this is not a halving operation.
25211
25212 ElementCount ResEC = VT.getVectorElementCount();
25213
25214 if (InVT.getVectorElementCount() != (ResEC * 2))
25215 return;
25216
25217 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
25218 if (!CIndex)
25219 return;
25220
25221 unsigned Index = CIndex->getZExtValue();
25222 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
25223 return;
25224
25225 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
25226 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
25227
25228 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
25229 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
25230}
25231
25232// Create an even/odd pair of X registers holding integer value V.
25233static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
25234 SDLoc dl(V.getNode());
25235 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i64, MVT::i64);
25236 if (DAG.getDataLayout().isBigEndian())
25237 std::swap (VLo, VHi);
25238 SDValue RegClass =
25239 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
25240 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
25241 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
25242 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
25243 return SDValue(
25244 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
25245}
25246
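// Expand a 128-bit ATOMIC_CMP_SWAP: use the CASP family when LSE (or the
// outline-atomics helpers) is available, otherwise fall back to the
// CMP_SWAP_128* pseudo instructions selected by memory ordering.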
25247static void ReplaceCMP_SWAP_128Results(SDNode *N,
25248 SmallVectorImpl<SDValue> &Results,
25249 SelectionDAG &DAG,
25250 const AArch64Subtarget *Subtarget) {
25251 assert(N->getValueType(0) == MVT::i128 &&
25252 "AtomicCmpSwap on types less than 128 should be legal");
25253
25254 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
25255 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
25256 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
25257 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
25258 SDValue Ops[] = {
25259 createGPRPairNode(DAG, N->getOperand(2)), // Compare value
25260 createGPRPairNode(DAG, N->getOperand(3)), // Store value
25261 N->getOperand(1), // Ptr
25262 N->getOperand(0), // Chain in
25263 };
25264
25265 unsigned Opcode;
25266 switch (MemOp->getMergedOrdering()) {
25267 case AtomicOrdering::Monotonic:
25268 Opcode = AArch64::CASPX;
25269 break;
25270 case AtomicOrdering::Acquire:
25271 Opcode = AArch64::CASPAX;
25272 break;
25273 case AtomicOrdering::Release:
25274 Opcode = AArch64::CASPLX;
25275 break;
25276 case AtomicOrdering::AcquireRelease:
25277 case AtomicOrdering::SequentiallyConsistent:
25278 Opcode = AArch64::CASPALX;
25279 break;
25280 default:
25281 llvm_unreachable("Unexpected ordering!");
25282 }
25283
25284 MachineSDNode *CmpSwap = DAG.getMachineNode(
25285 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
25286 DAG.setNodeMemRefs(CmpSwap, {MemOp});
25287
25288 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
25289 if (DAG.getDataLayout().isBigEndian())
25290 std::swap(SubReg1, SubReg2);
25291 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
25292 SDValue(CmpSwap, 0));
25293 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
25294 SDValue(CmpSwap, 0));
25295 Results.push_back(
25296 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
25297 Results.push_back(SDValue(CmpSwap, 1)); // Chain out
25298 return;
25299 }
25300
25301 unsigned Opcode;
25302 switch (MemOp->getMergedOrdering()) {
25303 case AtomicOrdering::Monotonic:
25304 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
25305 break;
25306 case AtomicOrdering::Acquire:
25307 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
25308 break;
25309 case AtomicOrdering::Release:
25310 Opcode = AArch64::CMP_SWAP_128_RELEASE;
25311 break;
25312 case AtomicOrdering::AcquireRelease:
25313 case AtomicOrdering::SequentiallyConsistent:
25314 Opcode = AArch64::CMP_SWAP_128;
25315 break;
25316 default:
25317 llvm_unreachable("Unexpected ordering!");
25318 }
25319
25320 SDLoc DL(N);
25321 auto Desired = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64);
25322 auto New = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64);
25323 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
25324 New.first, New.second, N->getOperand(0)};
25325 SDNode *CmpSwap = DAG.getMachineNode(
25326 Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
25327 Ops);
25328 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
25329
25330 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
25331 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
25332 Results.push_back(SDValue(CmpSwap, 3));
25333}
25334
25335static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
25336 AtomicOrdering Ordering) {
25337 // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
25338 // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
25339 // the type is not legal. Therefore we shouldn't expect to see a 128-bit
25340 // ATOMIC_LOAD_CLR at any point.
25341 assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
25342 "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
25343 assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
25344 assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
25345
25346 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
25347 // The operand will need to be XORed in a separate step.
25348 switch (Ordering) {
25349 case AtomicOrdering::Monotonic:
25350 return AArch64::LDCLRP;
25351 break;
25352 case AtomicOrdering::Acquire:
25353 return AArch64::LDCLRPA;
25354 break;
25355 case AtomicOrdering::Release:
25356 return AArch64::LDCLRPL;
25357 break;
25358 case AtomicOrdering::AcquireRelease:
25359 case AtomicOrdering::SequentiallyConsistent:
25360 return AArch64::LDCLRPAL;
25361 break;
25362 default:
25363 llvm_unreachable("Unexpected ordering!");
25364 }
25365 }
25366
25367 if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
25368 switch (Ordering) {
25369 case AtomicOrdering::Monotonic:
25370 return AArch64::LDSETP;
25371 break;
25372 case AtomicOrdering::Acquire:
25373 return AArch64::LDSETPA;
25374 break;
25375 case AtomicOrdering::Release:
25376 return AArch64::LDSETPL;
25377 break;
25378 case AtomicOrdering::AcquireRelease:
25379 case AtomicOrdering::SequentiallyConsistent:
25380 return AArch64::LDSETPAL;
25381 break;
25382 default:
25383 llvm_unreachable("Unexpected ordering!");
25384 }
25385 }
25386
25387 if (ISDOpcode == ISD::ATOMIC_SWAP) {
25388 switch (Ordering) {
25389 case AtomicOrdering::Monotonic:
25390 return AArch64::SWPP;
25391 break;
25392 case AtomicOrdering::Acquire:
25393 return AArch64::SWPPA;
25394 break;
25395 case AtomicOrdering::Release:
25396 return AArch64::SWPPL;
25397 break;
25398 case AtomicOrdering::AcquireRelease:
25399 case AtomicOrdering::SequentiallyConsistent:
25400 return AArch64::SWPPAL;
25401 break;
25402 default:
25403 llvm_unreachable("Unexpected ordering!");
25404 }
25405 }
25406
25407 llvm_unreachable("Unexpected ISDOpcode!");
25408}
25409
25410static void ReplaceATOMIC_LOAD_128Results(SDNode *N,
25411 SmallVectorImpl<SDValue> &Results,
25412 SelectionDAG &DAG,
25413 const AArch64Subtarget *Subtarget) {
25414 // LSE128 has 128-bit RMW ops, but i128 is not a legal type, so lower it
25415 // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
25416 // rather than the CASP instructions, because CASP has register classes for
25417 // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
25418 // to present them as single operands. LSE128 instructions use the GPR64
25419 // register class (because the pair does not have to be sequential), like
25420 // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
25421
25422 assert(N->getValueType(0) == MVT::i128 &&
25423 "AtomicLoadXXX on types less than 128 should be legal");
25424
25425 if (!Subtarget->hasLSE128())
25426 return;
25427
25428 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
25429 const SDValue &Chain = N->getOperand(0);
25430 const SDValue &Ptr = N->getOperand(1);
25431 const SDValue &Val128 = N->getOperand(2);
25432 std::pair<SDValue, SDValue> Val2x64 =
25433 DAG.SplitScalar(Val128, SDLoc(Val128), MVT::i64, MVT::i64);
25434
25435 const unsigned ISDOpcode = N->getOpcode();
25436 const unsigned MachineOpcode =
25437 getAtomicLoad128Opcode(ISDOpcode, MemOp->getMergedOrdering());
25438
25439 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
25440 SDLoc dl(Val128);
25441 Val2x64.first =
25442 DAG.getNode(ISD::XOR, dl, MVT::i64,
25443 DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.first);
25444 Val2x64.second =
25445 DAG.getNode(ISD::XOR, dl, MVT::i64,
25446 DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.second);
25447 }
25448
25449 SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
25450 if (DAG.getDataLayout().isBigEndian())
25451 std::swap(Ops[0], Ops[1]);
25452
25453 MachineSDNode *AtomicInst =
25454 DAG.getMachineNode(MachineOpcode, SDLoc(N),
25455 DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops);
25456
25457 DAG.setNodeMemRefs(AtomicInst, {MemOp});
25458
25459 SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
25460 if (DAG.getDataLayout().isBigEndian())
25461 std::swap(Lo, Hi);
25462
25463 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
25464 Results.push_back(SDValue(AtomicInst, 2)); // Chain out
25465}
25466
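// Custom result-type legalisation: produce replacement values for nodes whose
// result type is illegal, dispatching on the opcode below.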
25467void AArch64TargetLowering::ReplaceNodeResults(
25468 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
25469 switch (N->getOpcode()) {
25470 default:
25471 llvm_unreachable("Don't know how to custom expand this");
25472 case ISD::BITCAST:
25473 ReplaceBITCASTResults(N, Results, DAG);
25474 return;
25475 case ISD::VECREDUCE_ADD:
25476 case ISD::VECREDUCE_SMAX:
25477 case ISD::VECREDUCE_SMIN:
25478 case ISD::VECREDUCE_UMAX:
25479 case ISD::VECREDUCE_UMIN:
25480 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
25481 return;
25482 case ISD::ADD:
25483 case ISD::FADD:
25484 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
25485 return;
25486
25487 case ISD::CTPOP:
25488 case ISD::PARITY:
25489 if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
25490 Results.push_back(Result);
25491 return;
25492 case AArch64ISD::SADDV:
25493 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
25494 return;
25495 case AArch64ISD::UADDV:
25496 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
25497 return;
25498 case AArch64ISD::SMINV:
25499 ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
25500 return;
25501 case AArch64ISD::UMINV:
25502 ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
25503 return;
25504 case AArch64ISD::SMAXV:
25505 ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
25506 return;
25507 case AArch64ISD::UMAXV:
25508 ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
25509 return;
25510 case ISD::MULHS:
25511 if (useSVEForFixedLengthVectorVT(SDValue(N, 0).getValueType()))
25512 Results.push_back(
25513 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHS_PRED));
25514 return;
25515 case ISD::MULHU:
25516 if (useSVEForFixedLengthVectorVT(SDValue(N, 0).getValueType()))
25517 Results.push_back(
25518 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHU_PRED));
25519 return;
25520 case ISD::FP_TO_UINT:
25521 case ISD::FP_TO_SINT:
25524 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
25525 // Let normal code take care of it by not adding anything to Results.
25526 return;
25527 case ISD::ATOMIC_CMP_SWAP:
25528 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
25529 return;
25530 case ISD::ATOMIC_LOAD_CLR:
25531 assert(N->getValueType(0) != MVT::i128 &&
25532 "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
25533 break;
25534 case ISD::ATOMIC_LOAD_AND:
25535 case ISD::ATOMIC_LOAD_OR:
25536 case ISD::ATOMIC_SWAP: {
25537 assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
25538 "Expected 128-bit atomicrmw.");
25539 // These need custom type legalisation so we go directly to instruction.
25540 ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
25541 return;
25542 }
25543 case ISD::ATOMIC_LOAD:
25544 case ISD::LOAD: {
25545 MemSDNode *LoadNode = cast<MemSDNode>(N);
25546 EVT MemVT = LoadNode->getMemoryVT();
25547 // Handle lowering 256 bit non temporal loads into LDNP for little-endian
25548 // targets.
25549 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
25550 MemVT.getSizeInBits() == 256u &&
25551 (MemVT.getScalarSizeInBits() == 8u ||
25552 MemVT.getScalarSizeInBits() == 16u ||
25553 MemVT.getScalarSizeInBits() == 32u ||
25554 MemVT.getScalarSizeInBits() == 64u)) {
25555
25556 SDValue Result = DAG.getMemIntrinsicNode(
25557 AArch64ISD::LDNP, SDLoc(N),
25558 DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
25559 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
25560 MVT::Other}),
25561 {LoadNode->getChain(), LoadNode->getBasePtr()},
25562 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
25563
25564 SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
25565 Result.getValue(0), Result.getValue(1));
25566 Results.append({Pair, Result.getValue(2) /* Chain */});
25567 return;
25568 }
25569
25570 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
25571 LoadNode->getMemoryVT() != MVT::i128) {
25572 // Non-volatile or atomic loads are optimized later in AArch64's load/store
25573 // optimizer.
25574 return;
25575 }
25576
25577 if (SDValue(N, 0).getValueType() == MVT::i128) {
25578 auto *AN = dyn_cast<AtomicSDNode>(LoadNode);
25579 bool isLoadAcquire =
25580 AN && AN->getSuccessOrdering() == AtomicOrdering::Acquire;
25581 unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
25582
25583 if (isLoadAcquire)
25584 assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
25585
25586 SDValue Result = DAG.getMemIntrinsicNode(
25587 Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
25588 {LoadNode->getChain(), LoadNode->getBasePtr()},
25589 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
25590
25591 unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
25592
25593 SDValue Pair =
25594 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
25595 Result.getValue(FirstRes), Result.getValue(1 - FirstRes));
25596 Results.append({Pair, Result.getValue(2) /* Chain */});
25597 }
25598 return;
25599 }
25600 case ISD::EXTRACT_SUBVECTOR:
25601 ReplaceExtractSubVectorResults(N, Results, DAG);
25602 return;
25603 case ISD::INSERT_SUBVECTOR:
25604 case ISD::CONCAT_VECTORS:
25605 // Custom lowering has been requested for INSERT_SUBVECTOR and
25606 // CONCAT_VECTORS -- but delegate to common code for result type
25607 // legalisation
25608 return;
25609 case ISD::INTRINSIC_WO_CHAIN: {
25610 EVT VT = N->getValueType(0);
25611 assert((VT == MVT::i8 || VT == MVT::i16) &&
25612 "custom lowering for unexpected type");
25613
25614 Intrinsic::ID IntID =
25615 static_cast<Intrinsic::ID>(N->getConstantOperandVal(0));
25616 switch (IntID) {
25617 default:
25618 return;
25619 case Intrinsic::aarch64_sve_clasta_n: {
25620 SDLoc DL(N);
25621 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
25622 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
25623 N->getOperand(1), Op2, N->getOperand(3));
25624 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25625 return;
25626 }
25627 case Intrinsic::aarch64_sve_clastb_n: {
25628 SDLoc DL(N);
25629 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
25630 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
25631 N->getOperand(1), Op2, N->getOperand(3));
25632 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25633 return;
25634 }
25635 case Intrinsic::aarch64_sve_lasta: {
25636 SDLoc DL(N);
25637 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
25638 N->getOperand(1), N->getOperand(2));
25639 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25640 return;
25641 }
25642 case Intrinsic::aarch64_sve_lastb: {
25643 SDLoc DL(N);
25644 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
25645 N->getOperand(1), N->getOperand(2));
25646 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25647 return;
25648 }
25649 }
25650 }
25651 case ISD::READ_REGISTER: {
25652 SDLoc DL(N);
25653 assert(N->getValueType(0) == MVT::i128 &&
25654 "READ_REGISTER custom lowering is only for 128-bit sysregs");
25655 SDValue Chain = N->getOperand(0);
25656 SDValue SysRegName = N->getOperand(1);
25657
25658 SDValue Result = DAG.getNode(
25659 AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
25660 Chain, SysRegName);
25661
25662 // Sysregs are not endian. Result.getValue(0) always contains the lower half
25663 // of the 128-bit System Register value.
25664 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
25665 Result.getValue(0), Result.getValue(1));
25666 Results.push_back(Pair);
25667 Results.push_back(Result.getValue(2)); // Chain
25668 return;
25669 }
25670 }
25671}
25672
25673bool AArch64TargetLowering::useLoadStackGuardNode() const {
25674 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
25675 return TargetLowering::useLoadStackGuardNode();
25676 return true;
25677}
25678
25679unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
25680 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
25681 // reciprocal if there are three or more FDIVs.
25682 return 3;
25683}
25684
25685TargetLoweringBase::LegalizeTypeAction
25686AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
25687 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
25688 // v4i16, v2i32 instead of to promote.
25689 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
25690 VT == MVT::v1f32)
25691 return TypeWidenVector;
25692
25693 return TargetLoweringBase::getPreferredVectorAction(VT);
25694}
25695
25696// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
25697// provided the address is 16-byte aligned.
25698bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
25699 if (!Subtarget->hasLSE2())
25700 return false;
25701
25702 if (auto LI = dyn_cast<LoadInst>(I))
25703 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
25704 LI->getAlign() >= Align(16);
25705
25706 if (auto SI = dyn_cast<StoreInst>(I))
25707 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25708 SI->getAlign() >= Align(16);
25709
25710 return false;
25711}
25712
25713bool AArch64TargetLowering::isOpSuitableForLSE128(const Instruction *I) const {
25714 if (!Subtarget->hasLSE128())
25715 return false;
25716
25717 // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
25718 // will clobber the two registers.
25719 if (const auto *SI = dyn_cast<StoreInst>(I))
25720 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25721 SI->getAlign() >= Align(16) &&
25722 (SI->getOrdering() == AtomicOrdering::Release ||
25723 SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);
25724
25725 if (const auto *RMW = dyn_cast<AtomicRMWInst>(I))
25726 return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25727 RMW->getAlign() >= Align(16) &&
25728 (RMW->getOperation() == AtomicRMWInst::Xchg ||
25729 RMW->getOperation() == AtomicRMWInst::And ||
25730 RMW->getOperation() == AtomicRMWInst::Or);
25731
25732 return false;
25733}
25734
25735bool AArch64TargetLowering::isOpSuitableForRCPC3(const Instruction *I) const {
25736 if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
25737 return false;
25738
25739 if (auto LI = dyn_cast<LoadInst>(I))
25740 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
25741 LI->getAlign() >= Align(16) &&
25742 LI->getOrdering() == AtomicOrdering::Acquire;
25743
25744 if (auto SI = dyn_cast<StoreInst>(I))
25745 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25746 SI->getAlign() >= Align(16) &&
25747 SI->getOrdering() == AtomicOrdering::Release;
25748
25749 return false;
25750}
25751
25752bool AArch64TargetLowering::shouldInsertFencesForAtomic(
25753 const Instruction *I) const {
25754 if (isOpSuitableForRCPC3(I))
25755 return false;
25756 if (isOpSuitableForLSE128(I))
25757 return false;
25758 if (isOpSuitableForLDPSTP(I))
25759 return true;
25760 return false;
25761}
25762
25763bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore(
25764 const Instruction *I) const {
25765 // Store-Release instructions only provide seq_cst guarantees when paired with
25766 // Load-Acquire instructions. MSVC CRT does not use these instructions to
25767 // implement seq_cst loads and stores, so we need additional explicit fences
25768 // after memory writes.
25769 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
25770 return false;
25771
25772 switch (I->getOpcode()) {
25773 default:
25774 return false;
25775 case Instruction::AtomicCmpXchg:
25776 return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() ==
25777 AtomicOrdering::SequentiallyConsistent;
25778 case Instruction::AtomicRMW:
25779 return cast<AtomicRMWInst>(I)->getOrdering() ==
25780 AtomicOrdering::SequentiallyConsistent;
25781 case Instruction::Store:
25782 return cast<StoreInst>(I)->getOrdering() ==
25783 AtomicOrdering::SequentiallyConsistent;
25784 }
25785}
25786
25787// Loads and stores less than 128-bits are already atomic; ones above that
25788// are doomed anyway, so defer to the default libcall and blame the OS when
25789// things go wrong.
25790TargetLowering::AtomicExpansionKind
25791AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
25792 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
25793 if (Size != 128)
25794 return AtomicExpansionKind::None;
25795 if (isOpSuitableForRCPC3(SI))
25796 return AtomicExpansionKind::None;
25797 if (isOpSuitableForLSE128(SI))
25798 return AtomicExpansionKind::Expand;
25799 if (isOpSuitableForLDPSTP(SI))
25800 return AtomicExpansionKind::None;
25801 return AtomicExpansionKind::Expand;
25802}
25803
25804// Loads and stores less than 128-bits are already atomic; ones above that
25805// are doomed anyway, so defer to the default libcall and blame the OS when
25806// things go wrong.
25807TargetLowering::AtomicExpansionKind
25808AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
25809 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
25810
25811 if (Size != 128)
25812 return AtomicExpansionKind::None;
25813 if (isOpSuitableForRCPC3(LI))
25814 return AtomicExpansionKind::None;
25815 // No LSE128 loads
25816 if (isOpSuitableForLDPSTP(LI))
25817 return AtomicExpansionKind::None;
25818
25819 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25820 // implement atomicrmw without spilling. If the target address is also on the
25821 // stack and close enough to the spill slot, this can lead to a situation
25822 // where the monitor always gets cleared and the atomic operation can never
25823 // succeed. So at -O0 lower this operation to a CAS loop.
25824 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
25825 return AtomicExpansionKind::CmpXChg;
25826
25827 // Using CAS for an atomic load has a better chance of succeeding under high
25828 // contention situations. So use it if available.
25829 return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
25830 : AtomicExpansionKind::LLSC;
25831}
25832
25833// The "default" for integer RMW operations is to expand to an LL/SC loop.
25834// However, with the LSE instructions (or outline-atomics mode, which provides
25835// library routines in place of the LSE-instructions), we can directly emit many
25836// operations instead.
25837//
25838// Floating-point operations are always emitted to a cmpxchg loop, because they
25839// may trigger a trap which aborts an LLSC sequence.
25840TargetLowering::AtomicExpansionKind
25841AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
25842 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
25843 assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
25844
25845 if (AI->isFloatingPointOperation())
25846 return AtomicExpansionKind::CmpXChg;
25847
25848 bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
25849 (AI->getOperation() == AtomicRMWInst::Xchg ||
25850 AI->getOperation() == AtomicRMWInst::Or ||
25851 AI->getOperation() == AtomicRMWInst::And);
25852 if (CanUseLSE128)
25853 return AtomicExpansionKind::None;
25854
25855 // Nand is not supported in LSE.
25856 // Leave 128 bits to LLSC or CmpXChg.
25857 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
25858 if (Subtarget->hasLSE())
25859 return AtomicExpansionKind::None;
25860 if (Subtarget->outlineAtomics()) {
25861 // [U]Min/[U]Max RWM atomics are used in __sync_fetch_ libcalls so far.
25862 // Don't outline them unless
25863 // (1) high level <atomic> support approved:
25864 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
25865 // (2) low level libgcc and compiler-rt support implemented by:
25866 // min/max outline atomics helpers
25867 if (AI->getOperation() != AtomicRMWInst::Min &&
25868 AI->getOperation() != AtomicRMWInst::Max &&
25869 AI->getOperation() != AtomicRMWInst::UMin &&
25870 AI->getOperation() != AtomicRMWInst::UMax) {
25871 return AtomicExpansionKind::None;
25872 }
25873 }
25874 }
25875
25876 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25877 // implement atomicrmw without spilling. If the target address is also on the
25878 // stack and close enough to the spill slot, this can lead to a situation
25879 // where the monitor always gets cleared and the atomic operation can never
25880 // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
25881 // we have a single CAS instruction that can replace the loop.
25882 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None ||
25883 Subtarget->hasLSE())
25884 return AtomicExpansionKind::CmpXChg;
25885
25886 return AtomicExpansionKind::LLSC;
25887}
25888
25889TargetLowering::AtomicExpansionKind
25890AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
25891 AtomicCmpXchgInst *AI) const {
25892 // If subtarget has LSE, leave cmpxchg intact for codegen.
25893 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
25894 return AtomicExpansionKind::None;
25895 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25896 // implement cmpxchg without spilling. If the address being exchanged is also
25897 // on the stack and close enough to the spill slot, this can lead to a
25898 // situation where the monitor always gets cleared and the atomic operation
25899 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
25900 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
25901 return AtomicExpansionKind::None;
25902
25903 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
25904 // it.
25905 unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
25906 if (Size > 64)
25907 return AtomicExpansionKind::None;
25908
25909 return AtomicExpansionKind::LLSC;
25910}
25911
25912Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
25913 Type *ValueTy, Value *Addr,
25914 AtomicOrdering Ord) const {
25915 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25916 bool IsAcquire = isAcquireOrStronger(Ord);
25917
25918 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
25919 // intrinsic must return {i64, i64} and we have to recombine them into a
25920 // single i128 here.
25921 if (ValueTy->getPrimitiveSizeInBits() == 128) {
25922 Intrinsic::ID Int =
25923 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
25924 Function *Ldxr = Intrinsic::getDeclaration(M, Int);
25925
25926 Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
25927
25928 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
25929 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
25930 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
25931 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
25932 return Builder.CreateOr(
25933 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64");
25934 }
25935
25936 Type *Tys[] = { Addr->getType() };
25937 Intrinsic::ID Int =
25938 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
25939 Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
25940
25941 const DataLayout &DL = M->getDataLayout();
25942 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
25943 CallInst *CI = Builder.CreateCall(Ldxr, Addr);
25944 CI->addParamAttr(
25945 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy));
25946 Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
25947
25948 return Builder.CreateBitCast(Trunc, ValueTy);
25949}
25950
25951void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
25952 IRBuilderBase &Builder) const {
25953 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25954 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
25955}
25956
25957Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
25958 Value *Val, Value *Addr,
25959 AtomicOrdering Ord) const {
25960 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25961 bool IsRelease = isReleaseOrStronger(Ord);
25962
25963 // Since the intrinsics must have legal type, the i128 intrinsics take two
25964 // parameters: "i64, i64". We must marshal Val into the appropriate form
25965 // before the call.
25966 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
25967 Intrinsic::ID Int =
25968 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
25969 Function *Stxr = Intrinsic::getDeclaration(M, Int);
25970 Type *Int64Ty = Type::getInt64Ty(M->getContext());
25971
25972 Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
25973 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
25974 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
25975 }
25976
25977 Intrinsic::ID Int =
25978 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
25979 Type *Tys[] = { Addr->getType() };
25980 Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
25981
25982 const DataLayout &DL = M->getDataLayout();
25983 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
25984 Val = Builder.CreateBitCast(Val, IntValTy);
25985
25986 CallInst *CI = Builder.CreateCall(
25987 Stxr, {Builder.CreateZExtOrBitCast(
25988 Val, Stxr->getFunctionType()->getParamType(0)),
25989 Addr});
25990 CI->addParamAttr(1, Attribute::get(Builder.getContext(),
25991 Attribute::ElementType, Val->getType()));
25992 return CI;
25993}
25994
25995bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
25996 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
25997 const DataLayout &DL) const {
25998 if (!Ty->isArrayTy()) {
25999 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
26000 return TySize.isScalable() && TySize.getKnownMinValue() > 128;
26001 }
26002
26003 // All non aggregate members of the type must have the same type
26004 SmallVector<EVT> ValueVTs;
26005 ComputeValueVTs(*this, DL, Ty, ValueVTs);
26006 return all_equal(ValueVTs);
26007}
26008
26009bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
26010 EVT) const {
26011 return false;
26012}
26013
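// Form a pointer at a fixed byte offset from the thread pointer; used below
// for the Android and Fuchsia fixed TLS slots.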
26014static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
26015 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
26016 Function *ThreadPointerFunc =
26017 Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
26018 return IRB.CreatePointerCast(
26019 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
26020 Offset),
26021 IRB.getPtrTy(0));
26022}
26023
26024Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
26025 // Android provides a fixed TLS slot for the stack cookie. See the definition
26026 // of TLS_SLOT_STACK_GUARD in
26027 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
26028 if (Subtarget->isTargetAndroid())
26029 return UseTlsOffset(IRB, 0x28);
26030
26031 // Fuchsia is similar.
26032 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
26033 if (Subtarget->isTargetFuchsia())
26034 return UseTlsOffset(IRB, -0x10);
26035
26036 return TargetLowering::getIRStackGuard(IRB);
26037}
26038
26039void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
26040 // MSVC CRT provides functionalities for stack protection.
26041 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
26042 // MSVC CRT has a global variable holding security cookie.
26043 M.getOrInsertGlobal("__security_cookie",
26044 PointerType::getUnqual(M.getContext()));
26045
26046 // MSVC CRT has a function to validate security cookie.
26047 FunctionCallee SecurityCheckCookie =
26048 M.getOrInsertFunction(Subtarget->getSecurityCheckCookieName(),
26049 Type::getVoidTy(M.getContext()),
26050 PointerType::getUnqual(M.getContext()));
26051 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
26052 F->setCallingConv(CallingConv::Win64);
26053 F->addParamAttr(0, Attribute::AttrKind::InReg);
26054 }
26055 return;
26056 }
26057 TargetLowering::insertSSPDeclarations(M);
26058}
26059
26060Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
26061 // MSVC CRT has a global variable holding security cookie.
26062 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
26063 return M.getGlobalVariable("__security_cookie");
26064 return TargetLowering::getSDagStackGuard(M);
26065}
26066
26067Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
26068 // MSVC CRT has a function to validate security cookie.
26069 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
26070 return M.getFunction(Subtarget->getSecurityCheckCookieName());
26071 return TargetLowering::getSSPStackGuardCheck(M);
26072}
26073
26074Value *
26075AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
26076 // Android provides a fixed TLS slot for the SafeStack pointer. See the
26077 // definition of TLS_SLOT_SAFESTACK in
26078 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
26079 if (Subtarget->isTargetAndroid())
26080 return UseTlsOffset(IRB, 0x48);
26081
26082 // Fuchsia is similar.
26083 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
26084 if (Subtarget->isTargetFuchsia())
26085 return UseTlsOffset(IRB, -0x8);
26086
26087 return TargetLowering::getSafeStackPointerLocation(IRB);
26088}
26089
26090bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
26091 const Instruction &AndI) const {
26092 // Only sink 'and' mask to cmp use block if it is masking a single bit, since
26093 // this is likely to let the and/cmp/br fold into a single tbz instruction. It
26094 // may be beneficial to sink in other cases, but we would have to check that
26095 // the cmp would not get folded into the br to form a cbz for these to be
26096 // beneficial.
26097 ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
26098 if (!Mask)
26099 return false;
26100 return Mask->getValue().isPowerOf2();
26101}
26102
26103bool AArch64TargetLowering::
26104 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
26105 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
26106 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
26107 SelectionDAG &DAG) const {
26108 // Does baseline recommend not to perform the fold by default?
26109 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
26110 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
26111 return false;
26112 // Else, if this is a vector shift, prefer 'shl'.
26113 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
26114}
26115
26116TargetLowering::ShiftLegalizationStrategy
26117AArch64TargetLowering::preferredShiftLegalizationStrategy(
26118 SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
26119 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
26120 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
26121 return ShiftLegalizationStrategy::LowerToLibcall;
26122 return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
26123 ExpansionFactor);
26124}
26125
26126void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
26127 // Update IsSplitCSR in AArch64FunctionInfo.
26128 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
26129 AFI->setIsSplitCSR(true);
26130}
26131
26132void AArch64TargetLowering::insertCopiesSplitCSR(
26133 MachineBasicBlock *Entry,
26134 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
26135 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
26136 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
26137 if (!IStart)
26138 return;
26139
26140 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
26141 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
26142 MachineBasicBlock::iterator MBBI = Entry->begin();
26143 for (const MCPhysReg *I = IStart; *I; ++I) {
26144 const TargetRegisterClass *RC = nullptr;
26145 if (AArch64::GPR64RegClass.contains(*I))
26146 RC = &AArch64::GPR64RegClass;
26147 else if (AArch64::FPR64RegClass.contains(*I))
26148 RC = &AArch64::FPR64RegClass;
26149 else
26150 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
26151
26152 Register NewVR = MRI->createVirtualRegister(RC);
26153 // Create copy from CSR to a virtual register.
26154 // FIXME: this currently does not emit CFI pseudo-instructions, it works
26155 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
26156 // nounwind. If we want to generalize this later, we may need to emit
26157 // CFI pseudo-instructions.
26158 assert(Entry->getParent()->getFunction().hasFnAttribute(
26159 Attribute::NoUnwind) &&
26160 "Function should be nounwind in insertCopiesSplitCSR!");
26161 Entry->addLiveIn(*I);
26162 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
26163 .addReg(*I);
26164
26165 // Insert the copy-back instructions right before the terminator.
26166 for (auto *Exit : Exits)
26167 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
26168 TII->get(TargetOpcode::COPY), *I)
26169 .addReg(NewVR);
26170 }
26171}
26172
26173bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
26174 // Integer division on AArch64 is expensive. However, when aggressively
26175 // optimizing for code size, we prefer to use a div instruction, as it is
26176 // usually smaller than the alternative sequence.
26177 // The exception to this is vector division. Since AArch64 doesn't have vector
26178 // integer division, leaving the division as-is is a loss even in terms of
26179 // size, because it will have to be scalarized, while the alternative code
26180 // sequence can be performed in vector form.
26181 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
26182 return OptSize && !VT.isVector();
26183}
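// Illustrative sketch (not from the original source): under minsize a scalar
// "sdiv i32 %a, 7" is kept as a constant materialization plus one SDIV,
// instead of the usual (faster but larger) multiply-by-magic-constant and
// shift sequence; a vector divide such as "sdiv <4 x i32>" still prefers the
// expansion, because scalarizing a vector SDIV would be larger anyway.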
26184
26185bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
26186 // We want inc-of-add for scalars and sub-of-not for vectors.
26187 return VT.isScalarInteger();
26188}
26189
26190bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
26191 EVT VT) const {
26192 // v8f16 without fp16 needs to be extended to v8f32, which is more difficult to
26193 // legalize.
26194 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
26195 return false;
26196 if (FPVT == MVT::v8bf16)
26197 return false;
26198 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
26199}
26200
26201MachineInstr *
26202AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
26203 MachineBasicBlock::instr_iterator &MBBI,
26204 const TargetInstrInfo *TII) const {
26205 assert(MBBI->isCall() && MBBI->getCFIType() &&
26206 "Invalid call instruction for a KCFI check");
26207
26208 switch (MBBI->getOpcode()) {
26209 case AArch64::BLR:
26210 case AArch64::BLRNoIP:
26211 case AArch64::TCRETURNri:
26212 case AArch64::TCRETURNrix16x17:
26213 case AArch64::TCRETURNrix17:
26214 case AArch64::TCRETURNrinotx16:
26215 break;
26216 default:
26217 llvm_unreachable("Unexpected CFI call opcode");
26218 }
26219
26220 MachineOperand &Target = MBBI->getOperand(0);
26221 assert(Target.isReg() && "Invalid target operand for an indirect call");
26222 Target.setIsRenamable(false);
26223
26224 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK))
26225 .addReg(Target.getReg())
26226 .addImm(MBBI->getCFIType())
26227 .getInstr();
26228}
26229
26230bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
26231 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
26232}
26233
26234unsigned
26235AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
26236 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
26237 return getPointerTy(DL).getSizeInBits();
26238
26239 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
26240}
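// Illustrative arithmetic (not from the original source): Darwin and Windows
// use a char* va_list, hence just the pointer size (64 bits). The AAPCS64
// va_list struct carries three pointers (__stack, __gr_top, __vr_top) and two
// 32-bit offsets (__gr_offs, __vr_offs), giving
//   3 * 64 + 2 * 32 = 256 bits (32 bytes).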
26241
26242void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
26243 MachineFrameInfo &MFI = MF.getFrameInfo();
26244 // If we have any vulnerable SVE stack objects then the stack protector
26245 // needs to be placed at the top of the SVE stack area, as the SVE locals
26246 // are placed above the other locals, so we allocate it as if it were a
26247 // scalable vector.
26248 // FIXME: It may be worthwhile having a specific interface for this rather
26249 // than doing it here in finalizeLowering.
26250 if (MFI.hasStackProtectorIndex()) {
26251 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
26252 if (MFI.getStackID(i) == TargetStackID::ScalableVector &&
26253 MFI.getObjectSSPLayout(i) != MachineFrameInfo::SSPLK_None) {
26254 MFI.setStackID(MFI.getStackProtectorIndex(),
26255 TargetStackID::ScalableVector);
26256 MFI.setObjectAlignment(MFI.getStackProtectorIndex(), Align(16));
26257 break;
26258 }
26259 }
26260 }
26261 MFI.computeMaxCallFrameSize(MF);
26262 TargetLowering::finalizeLowering(MF);
26263}
26264
26265// Unlike X86, we let frame lowering assign offsets to all catch objects.
26267 return false;
26268}
26269
26270bool AArch64TargetLowering::shouldLocalize(
26271 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
26272 auto &MF = *MI.getMF();
26273 auto &MRI = MF.getRegInfo();
26274 auto maxUses = [](unsigned RematCost) {
26275 // A cost of 1 means remats are basically free.
26276 if (RematCost == 1)
26277 return std::numeric_limits<unsigned>::max();
26278 if (RematCost == 2)
26279 return 2U;
26280
26281 // Remat is too expensive, only sink if there's one user.
26282 if (RematCost > 2)
26283 return 1U;
26284 llvm_unreachable("Unexpected remat cost");
26285 };
26286
26287 unsigned Opc = MI.getOpcode();
26288 switch (Opc) {
26289 case TargetOpcode::G_GLOBAL_VALUE: {
26290 // On Darwin, TLS global vars get selected into function calls, which
26291 // we don't want localized, as they can get moved into the middle of
26292 // another call sequence.
26293 const GlobalValue &GV = *MI.getOperand(1).getGlobal();
26294 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
26295 return false;
26296 return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
26297 }
26298 case TargetOpcode::G_FCONSTANT:
26299 case TargetOpcode::G_CONSTANT: {
26300 const ConstantInt *CI;
26301 unsigned AdditionalCost = 0;
26302
26303 if (Opc == TargetOpcode::G_CONSTANT)
26304 CI = MI.getOperand(1).getCImm();
26305 else {
26306 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
26307 // We try to estimate cost of 32/64b fpimms, as they'll likely be
26308 // materialized as integers.
26309 if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
26310 break;
26311 auto APF = MI.getOperand(1).getFPImm()->getValueAPF();
26312 bool OptForSize =
26315 OptForSize))
26316 return true; // Constant should be cheap.
26317 CI =
26318 ConstantInt::get(MF.getFunction().getContext(), APF.bitcastToAPInt());
26319 // FP materialization also costs an extra move, from gpr to fpr.
26320 AdditionalCost = 1;
26321 }
26322 APInt Imm = CI->getValue();
26325 assert(Cost.isValid() && "Expected a valid imm cost");
26326
26327 unsigned RematCost = *Cost.getValue();
26328 RematCost += AdditionalCost;
26329 Register Reg = MI.getOperand(0).getReg();
26330 unsigned MaxUses = maxUses(RematCost);
26331 // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
26332 if (MaxUses == std::numeric_limits<unsigned>::max())
26333 --MaxUses;
26334 return MRI.hasAtMostUserInstrs(Reg, MaxUses);
26335 }
26336 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
26337 // localizable.
26338 case AArch64::ADRP:
26339 case AArch64::G_ADD_LOW:
26340 // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
26341 case TargetOpcode::G_PTR_ADD:
26342 return true;
26343 default:
26344 break;
26345 }
26346 return TargetLoweringBase::shouldLocalize(MI, TTI);
26347}
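// Illustrative sketch (not from the original source): per the maxUses lambda
// above, a constant that remats in one instruction is localized regardless of
// its user count, a two-instruction constant is localized for at most two
// user instructions, and anything more expensive is only localized (i.e.
// sunk) when it has a single user, so expensive immediates are not duplicated
// into many blocks.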
26348
26349bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
26350 if (Inst.getType()->isScalableTy())
26351 return true;
26352
26353 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
26354 if (Inst.getOperand(i)->getType()->isScalableTy())
26355 return true;
26356
26357 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
26358 if (AI->getAllocatedType()->isScalableTy())
26359 return true;
26360 }
26361
26362 // Checks to allow the use of SME instructions
26363 if (auto *Base = dyn_cast<CallBase>(&Inst)) {
26364 auto CallerAttrs = SMEAttrs(*Inst.getFunction());
26365 auto CalleeAttrs = SMEAttrs(*Base);
26366 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
26367 CallerAttrs.requiresLazySave(CalleeAttrs) ||
26368 CallerAttrs.requiresPreservingZT0(CalleeAttrs))
26369 return true;
26370 }
26371 return false;
26372}
26373
26374// Return the largest legal scalable vector type that matches VT's element type.
26375static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
26376 assert(VT.isFixedLengthVector() &&
26377 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
26378 "Expected legal fixed length vector!");
26379 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
26380 default:
26381 llvm_unreachable("unexpected element type for SVE container");
26382 case MVT::i8:
26383 return EVT(MVT::nxv16i8);
26384 case MVT::i16:
26385 return EVT(MVT::nxv8i16);
26386 case MVT::i32:
26387 return EVT(MVT::nxv4i32);
26388 case MVT::i64:
26389 return EVT(MVT::nxv2i64);
26390 case MVT::bf16:
26391 return EVT(MVT::nxv8bf16);
26392 case MVT::f16:
26393 return EVT(MVT::nxv8f16);
26394 case MVT::f32:
26395 return EVT(MVT::nxv4f32);
26396 case MVT::f64:
26397 return EVT(MVT::nxv2f64);
26398 }
26399}
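// Illustrative mapping (not from the original source): only the element type
// matters here, so legal fixed-length v8i32 and v16i32 both map to the packed
// nxv4i32 container and v16f16 maps to nxv8f16; the element count is handled
// separately by the governing predicate.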
26400
26401// Return a PTRUE with active lanes corresponding to the extent of VT.
26403 EVT VT) {
26406 "Expected legal fixed length vector!");
26407
26408 std::optional<unsigned> PgPattern =
26410 assert(PgPattern && "Unexpected element count for SVE predicate");
26411
26412 // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
26413 // AArch64SVEPredPattern::all, which can enable the use of unpredicated
26414 // variants of instructions when available.
26415 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
26416 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
26417 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
26418 if (MaxSVESize && MinSVESize == MaxSVESize &&
26419 MaxSVESize == VT.getSizeInBits())
26420 PgPattern = AArch64SVEPredPattern::all;
26421
26422 MVT MaskVT;
26423 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
26424 default:
26425 llvm_unreachable("unexpected element type for SVE predicate");
26426 case MVT::i8:
26427 MaskVT = MVT::nxv16i1;
26428 break;
26429 case MVT::i16:
26430 case MVT::f16:
26431 case MVT::bf16:
26432 MaskVT = MVT::nxv8i1;
26433 break;
26434 case MVT::i32:
26435 case MVT::f32:
26436 MaskVT = MVT::nxv4i1;
26437 break;
26438 case MVT::i64:
26439 case MVT::f64:
26440 MaskVT = MVT::nxv2i1;
26441 break;
26442 }
26443
26444 return getPTrue(DAG, DL, MaskVT, *PgPattern);
26445}
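// Illustrative sketch (not from the original source): for fixed-length v8i32
// the container is nxv4i32, so this returns an nxv4i1 PTRUE with the VL8
// pattern (the first eight 32-bit lanes active); the "all" pattern is
// substituted only when the minimum and maximum SVE register sizes are known
// to equal the vector's size.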
26446
26448 EVT VT) {
26450 "Expected legal scalable vector!");
26451 auto PredTy = VT.changeVectorElementType(MVT::i1);
26452 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
26453}
26454
26456 if (VT.isFixedLengthVector())
26457 return getPredicateForFixedLengthVector(DAG, DL, VT);
26458
26459 return getPredicateForScalableVector(DAG, DL, VT);
26460}
26461
26462// Grow V to consume an entire SVE register.
26463static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
26464 assert(VT.isScalableVector() &&
26465 "Expected to convert into a scalable vector!");
26466 assert(V.getValueType().isFixedLengthVector() &&
26467 "Expected a fixed length vector operand!");
26468 SDLoc DL(V);
26469 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
26470 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
26471}
26472
26473// Shrink V so it's just big enough to maintain a VT's worth of data.
26474static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
26475 assert(VT.isFixedLengthVector() &&
26476 "Expected to convert into a fixed length vector!");
26477 assert(V.getValueType().isScalableVector() &&
26478 "Expected a scalable vector operand!");
26479 SDLoc DL(V);
26480 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
26481 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
26482}
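// Illustrative sketch (not from the original source): the two helpers above
// are inverses for data living in the low lanes, e.g.
//   %s = INSERT_SUBVECTOR nxv4i32 undef, v4i32 %v, 0
//   %f = EXTRACT_SUBVECTOR nxv4i32 %s, 0
// yields %f == %v, because fixed-length values always occupy the low part of
// an SVE register.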
26483
26484// Convert all fixed length vector loads larger than NEON to masked_loads.
26485SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
26486 SDValue Op, SelectionDAG &DAG) const {
26487 auto Load = cast<LoadSDNode>(Op);
26488
26489 SDLoc DL(Op);
26490 EVT VT = Op.getValueType();
26491 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26492 EVT LoadVT = ContainerVT;
26493 EVT MemVT = Load->getMemoryVT();
26494
26495 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
26496
26497 if (VT.isFloatingPoint()) {
26498 LoadVT = ContainerVT.changeTypeToInteger();
26499 MemVT = MemVT.changeTypeToInteger();
26500 }
26501
26502 SDValue NewLoad = DAG.getMaskedLoad(
26503 LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
26504 DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
26505 Load->getAddressingMode(), Load->getExtensionType());
26506
26507 SDValue Result = NewLoad;
26508 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
26509 EVT ExtendVT = ContainerVT.changeVectorElementType(
26510 Load->getMemoryVT().getVectorElementType());
26511
26512 Result = getSVESafeBitCast(ExtendVT, Result, DAG);
26514 Pg, Result, DAG.getUNDEF(ContainerVT));
26515 } else if (VT.isFloatingPoint()) {
26516 Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
26517 }
26518
26519 Result = convertFromScalableVector(DAG, VT, Result);
26520 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
26521 return DAG.getMergeValues(MergedValues, DL);
26522}
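// Illustrative sketch (not from the original source): a v8i32 load that is
// wider than NEON is rewritten along the lines of
//   Pg  = PTRUE nxv4i1, VL8
//   Ld  = masked_load nxv4i32, <ptr>, Pg, passthru=undef
//   Res = EXTRACT_SUBVECTOR Ld, 0      ; back to v8i32
// with floating-point types detouring through the integer container so that
// extending loads can be finished with FP_EXTEND_MERGE_PASSTHRU.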
26523
26525 SelectionDAG &DAG) {
26526 SDLoc DL(Mask);
26527 EVT InVT = Mask.getValueType();
26528 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26529
26530 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
26531
26532 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
26533 return Pg;
26534
26535 auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
26536 auto Op2 = DAG.getConstant(0, DL, ContainerVT);
26537
26539 {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});
26540}
26541
26542// Convert all fixed length vector loads larger than NEON to masked_loads.
26543SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
26544 SDValue Op, SelectionDAG &DAG) const {
26545 auto Load = cast<MaskedLoadSDNode>(Op);
26546
26547 SDLoc DL(Op);
26548 EVT VT = Op.getValueType();
26549 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26550
26551 SDValue Mask = Load->getMask();
26552 // If this is an extending load and the mask type is not the same as
26553 // the load's type, then we have to extend the mask type.
26554 if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
26555 assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
26556 "Incorrect mask type");
26557 Mask = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Mask);
26558 }
26559 Mask = convertFixedMaskToScalableVector(Mask, DAG);
26560
26561 SDValue PassThru;
26562 bool IsPassThruZeroOrUndef = false;
26563
26564 if (Load->getPassThru()->isUndef()) {
26565 PassThru = DAG.getUNDEF(ContainerVT);
26566 IsPassThruZeroOrUndef = true;
26567 } else {
26568 if (ContainerVT.isInteger())
26569 PassThru = DAG.getConstant(0, DL, ContainerVT);
26570 else
26571 PassThru = DAG.getConstantFP(0, DL, ContainerVT);
26572 if (isZerosVector(Load->getPassThru().getNode()))
26573 IsPassThruZeroOrUndef = true;
26574 }
26575
26576 SDValue NewLoad = DAG.getMaskedLoad(
26577 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
26578 Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
26579 Load->getAddressingMode(), Load->getExtensionType());
26580
26581 SDValue Result = NewLoad;
26582 if (!IsPassThruZeroOrUndef) {
26583 SDValue OldPassThru =
26584 convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
26585 Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
26586 }
26587
26588 Result = convertFromScalableVector(DAG, VT, Result);
26589 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
26590 return DAG.getMergeValues(MergedValues, DL);
26591}
26592
26593// Convert all fixed length vector stores larger than NEON to masked_stores.
26594SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
26595 SDValue Op, SelectionDAG &DAG) const {
26596 auto Store = cast<StoreSDNode>(Op);
26597
26598 SDLoc DL(Op);
26599 EVT VT = Store->getValue().getValueType();
26600 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26601 EVT MemVT = Store->getMemoryVT();
26602
26603 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
26604 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
26605
26606 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
26607 EVT TruncVT = ContainerVT.changeVectorElementType(
26608 Store->getMemoryVT().getVectorElementType());
26609 MemVT = MemVT.changeTypeToInteger();
26610 NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
26611 NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
26612 DAG.getUNDEF(TruncVT));
26613 NewValue =
26614 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
26615 } else if (VT.isFloatingPoint()) {
26616 MemVT = MemVT.changeTypeToInteger();
26617 NewValue =
26618 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
26619 }
26620
26621 return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
26622 Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
26623 Store->getMemOperand(), Store->getAddressingMode(),
26624 Store->isTruncatingStore());
26625}
26626
26627SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
26628 SDValue Op, SelectionDAG &DAG) const {
26629 auto *Store = cast<MaskedStoreSDNode>(Op);
26630
26631 SDLoc DL(Op);
26632 EVT VT = Store->getValue().getValueType();
26633 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26634
26635 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
26636 auto Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG);
26637
26638 return DAG.getMaskedStore(
26639 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
26640 Mask, Store->getMemoryVT(), Store->getMemOperand(),
26641 Store->getAddressingMode(), Store->isTruncatingStore());
26642}
26643
26644SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
26645 SDValue Op, SelectionDAG &DAG) const {
26646 SDLoc dl(Op);
26647 EVT VT = Op.getValueType();
26648 EVT EltVT = VT.getVectorElementType();
26649
26650 bool Signed = Op.getOpcode() == ISD::SDIV;
26651 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
26652
26653 bool Negated;
26654 uint64_t SplatVal;
26655 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
26656 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26657 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
26658 SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32);
26659
26660 SDValue Pg = getPredicateForFixedLengthVector(DAG, dl, VT);
26661 SDValue Res =
26662 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, ContainerVT, Pg, Op1, Op2);
26663 if (Negated)
26664 Res = DAG.getNode(ISD::SUB, dl, ContainerVT,
26665 DAG.getConstant(0, dl, ContainerVT), Res);
26666
26667 return convertFromScalableVector(DAG, VT, Res);
26668 }
26669
26670 // Scalable vector i32/i64 DIV is supported.
26671 if (EltVT == MVT::i32 || EltVT == MVT::i64)
26672 return LowerToPredicatedOp(Op, DAG, PredOpcode);
26673
26674 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
26675 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
26676 EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
26677 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
26678
26679 // If the wider type is legal: extend, op, and truncate.
26680 EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
26681 if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
26682 SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(0));
26683 SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(1));
26684 SDValue Div = DAG.getNode(Op.getOpcode(), dl, WideVT, Op0, Op1);
26685 return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
26686 }
26687
26688 auto HalveAndExtendVector = [&DAG, &dl, &HalfVT, &PromVT,
26689 &ExtendOpcode](SDValue Op) {
26690 SDValue IdxZero = DAG.getConstant(0, dl, MVT::i64);
26691 SDValue IdxHalf =
26692 DAG.getConstant(HalfVT.getVectorNumElements(), dl, MVT::i64);
26693 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxZero);
26694 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxHalf);
26695 return std::pair<SDValue, SDValue>(
26696 {DAG.getNode(ExtendOpcode, dl, PromVT, Lo),
26697 DAG.getNode(ExtendOpcode, dl, PromVT, Hi)});
26698 };
26699
26700 // If wider type is not legal: split, extend, op, trunc and concat.
26701 auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
26702 auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
26703 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0LoExt, Op1LoExt);
26704 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0HiExt, Op1HiExt);
26705 SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Lo);
26706 SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Hi);
26707 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, {LoTrunc, HiTrunc});
26708}
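// Illustrative sketch (not from the original source): "sdiv <8 x i32> %a,
// splat(8)" takes the power-of-two path above and becomes SRAD_MERGE_OP1
// (an asrd by #3) on the nxv4i32 container. An i8/i16 divide has no SVE
// instruction, so it is widened one element-size step at a time (extend,
// divide, truncate as a whole vector when the wider type is legal, otherwise
// split into halves and concatenated) until it reaches i32.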
26709
26710SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
26711 SDValue Op, SelectionDAG &DAG) const {
26712 EVT VT = Op.getValueType();
26713 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26714
26715 SDLoc DL(Op);
26716 SDValue Val = Op.getOperand(0);
26717 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
26718 Val = convertToScalableVector(DAG, ContainerVT, Val);
26719
26720 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
26721 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
26722
26723 // Repeatedly unpack Val until the result is of the desired element type.
26724 switch (ContainerVT.getSimpleVT().SimpleTy) {
26725 default:
26726 llvm_unreachable("unimplemented container type");
26727 case MVT::nxv16i8:
26728 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
26729 if (VT.getVectorElementType() == MVT::i16)
26730 break;
26731 [[fallthrough]];
26732 case MVT::nxv8i16:
26733 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
26734 if (VT.getVectorElementType() == MVT::i32)
26735 break;
26736 [[fallthrough]];
26737 case MVT::nxv4i32:
26738 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
26739 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
26740 break;
26741 }
26742
26743 return convertFromScalableVector(DAG, VT, Val);
26744}
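// Illustrative sketch (not from the original source): sign-extending
// v8i8 -> v8i32 starts in the nxv16i8 container and applies SUNPKLO twice
// (nxv16i8 -> nxv8i16 -> nxv4i32), each step widening the low half of the
// lanes, before the fixed-length v8i32 result is extracted.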
26745
26746SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
26747 SDValue Op, SelectionDAG &DAG) const {
26748 EVT VT = Op.getValueType();
26749 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26750
26751 SDLoc DL(Op);
26752 SDValue Val = Op.getOperand(0);
26753 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
26754 Val = convertToScalableVector(DAG, ContainerVT, Val);
26755
26756 // Repeatedly truncate Val until the result is of the desired element type.
26757 switch (ContainerVT.getSimpleVT().SimpleTy) {
26758 default:
26759 llvm_unreachable("unimplemented container type");
26760 case MVT::nxv2i64:
26761 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
26762 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
26763 if (VT.getVectorElementType() == MVT::i32)
26764 break;
26765 [[fallthrough]];
26766 case MVT::nxv4i32:
26767 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
26768 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
26769 if (VT.getVectorElementType() == MVT::i16)
26770 break;
26771 [[fallthrough]];
26772 case MVT::nxv8i16:
26773 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
26774 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
26775 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
26776 break;
26777 }
26778
26779 return convertFromScalableVector(DAG, VT, Val);
26780}
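// Illustrative sketch (not from the original source): truncating
// v4i64 -> v4i8 walks the switch above three times, each time bitcasting to
// the next narrower container and applying UZP1 with the value as both
// operands (nxv2i64 -> nxv4i32 -> nxv8i16 -> nxv16i8), so only the low half
// of every element survives each step.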
26781
26782SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
26783 SDValue Op, SelectionDAG &DAG) const {
26784 EVT VT = Op.getValueType();
26785 EVT InVT = Op.getOperand(0).getValueType();
26786 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
26787
26788 SDLoc DL(Op);
26789 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26790 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
26791
26792 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
26793}
26794
26795SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
26796 SDValue Op, SelectionDAG &DAG) const {
26797 EVT VT = Op.getValueType();
26798 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26799
26800 SDLoc DL(Op);
26801 EVT InVT = Op.getOperand(0).getValueType();
26802 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26803 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
26804
26805 auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
26806 Op.getOperand(1), Op.getOperand(2));
26807
26808 return convertFromScalableVector(DAG, VT, ScalableRes);
26809}
26810
26811// Convert vector operation 'Op' to an equivalent predicated operation whereby
26812// the original operation's type is used to construct a suitable predicate.
26813// NOTE: The results for inactive lanes are undefined.
26814SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
26815 SelectionDAG &DAG,
26816 unsigned NewOp) const {
26817 EVT VT = Op.getValueType();
26818 SDLoc DL(Op);
26819 auto Pg = getPredicateForVector(DAG, DL, VT);
26820
26821 if (VT.isFixedLengthVector()) {
26822 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
26823 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26824
26825 // Create list of operands by converting existing ones to scalable types.
26827 for (const SDValue &V : Op->op_values()) {
26828 if (isa<CondCodeSDNode>(V)) {
26829 Operands.push_back(V);
26830 continue;
26831 }
26832
26833 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
26834 EVT VTArg = VTNode->getVT().getVectorElementType();
26835 EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
26836 Operands.push_back(DAG.getValueType(NewVTArg));
26837 continue;
26838 }
26839
26840 assert(isTypeLegal(V.getValueType()) &&
26841 "Expected only legal fixed-width types");
26842 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
26843 }
26844
26845 if (isMergePassthruOpcode(NewOp))
26846 Operands.push_back(DAG.getUNDEF(ContainerVT));
26847
26848 auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
26849 return convertFromScalableVector(DAG, VT, ScalableRes);
26850 }
26851
26852 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
26853
26855 for (const SDValue &V : Op->op_values()) {
26856 assert((!V.getValueType().isVector() ||
26857 V.getValueType().isScalableVector()) &&
26858 "Only scalable vectors are supported!");
26859 Operands.push_back(V);
26860 }
26861
26862 if (isMergePassthruOpcode(NewOp))
26863 Operands.push_back(DAG.getUNDEF(VT));
26864
26865 return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
26866}
26867
26868// If a fixed length vector operation has no side effects when applied to
26869// undefined elements, we can safely use scalable vectors to perform the same
26870// operation without needing to worry about predication.
26871SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
26872 SelectionDAG &DAG) const {
26873 EVT VT = Op.getValueType();
26875 "Only expected to lower fixed length vector operation!");
26876 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26877
26878 // Create list of operands by converting existing ones to scalable types.
26880 for (const SDValue &V : Op->op_values()) {
26881 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
26882
26883 // Pass through non-vector operands.
26884 if (!V.getValueType().isVector()) {
26885 Ops.push_back(V);
26886 continue;
26887 }
26888
26889 // "cast" fixed length vector to a scalable vector.
26890 assert(V.getValueType().isFixedLengthVector() &&
26891 isTypeLegal(V.getValueType()) &&
26892 "Only fixed length vectors are supported!");
26893 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
26894 }
26895
26896 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
26897 return convertFromScalableVector(DAG, VT, ScalableRes);
26898}
26899
26900SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
26901 SelectionDAG &DAG) const {
26902 SDLoc DL(ScalarOp);
26903 SDValue AccOp = ScalarOp.getOperand(0);
26904 SDValue VecOp = ScalarOp.getOperand(1);
26905 EVT SrcVT = VecOp.getValueType();
26906 EVT ResVT = SrcVT.getVectorElementType();
26907
26908 EVT ContainerVT = SrcVT;
26909 if (SrcVT.isFixedLengthVector()) {
26910 ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
26911 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
26912 }
26913
26914 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
26915 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
26916
26917 // Convert operands to Scalable.
26918 AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
26919 DAG.getUNDEF(ContainerVT), AccOp, Zero);
26920
26921 // Perform reduction.
26922 SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
26923 Pg, AccOp, VecOp);
26924
26925 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
26926}
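// Illustrative sketch (not from the original source): for an ordered
// reduction "vecreduce_seq_fadd(float %acc, <4 x float> %v)" the scalar
// accumulator is inserted into lane 0 of an undef container, FADDA_PRED
// accumulates the active lanes strictly in order under the VL4 predicate,
// and lane 0 of the result is extracted as the scalar return value.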
26927
26928SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
26929 SelectionDAG &DAG) const {
26930 SDLoc DL(ReduceOp);
26931 SDValue Op = ReduceOp.getOperand(0);
26932 EVT OpVT = Op.getValueType();
26933 EVT VT = ReduceOp.getValueType();
26934
26935 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
26936 return SDValue();
26937
26938 SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
26939
26940 switch (ReduceOp.getOpcode()) {
26941 default:
26942 return SDValue();
26943 case ISD::VECREDUCE_OR:
26944 if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
26945 // The predicate can be 'Op' because
26946 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
26947 return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
26948 else
26949 return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
26950 case ISD::VECREDUCE_AND: {
26951 Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
26952 return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
26953 }
26954 case ISD::VECREDUCE_XOR: {
26955 SDValue ID =
26956 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
26957 if (OpVT == MVT::nxv1i1) {
26958 // Emulate a CNTP on .Q using .D and a different governing predicate.
26959 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
26960 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
26961 }
26962 SDValue Cntp =
26963 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
26964 return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
26965 }
26966 }
26967
26968 return SDValue();
26969}
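// Illustrative sketch (not from the original source): a vecreduce_and over a
// scalable i1 vector is handled via De Morgan's law, i.e.
//   all(Op) <=> none(xor(Op, Ptrue))
// so the predicate is inverted with XOR and tested with PTEST/NONE_ACTIVE,
// while vecreduce_xor counts the active lanes with aarch64_sve_cntp and the
// truncation effectively keeps only the low (parity) bit of that count.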
26970
26971SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
26972 SDValue ScalarOp,
26973 SelectionDAG &DAG) const {
26974 SDLoc DL(ScalarOp);
26975 SDValue VecOp = ScalarOp.getOperand(0);
26976 EVT SrcVT = VecOp.getValueType();
26977
26979 SrcVT,
26980 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
26981 EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
26982 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
26983 }
26984
26985 // UADDV always returns an i64 result.
26986 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
26987 SrcVT.getVectorElementType();
26988 EVT RdxVT = SrcVT;
26989 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
26990 RdxVT = getPackedSVEVectorVT(ResVT);
26991
26992 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
26993 SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
26994 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
26995 Rdx, DAG.getConstant(0, DL, MVT::i64));
26996
26997 // The VEC_REDUCE nodes expect an element size result.
26998 if (ResVT != ScalarOp.getValueType())
26999 Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
27000
27001 return Res;
27002}
27003
27004SDValue
27005AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
27006 SelectionDAG &DAG) const {
27007 EVT VT = Op.getValueType();
27008 SDLoc DL(Op);
27009
27010 EVT InVT = Op.getOperand(1).getValueType();
27011 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
27012 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
27013 SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
27014
27015 // Convert the mask to a predicate (NOTE: We don't need to worry about
27016 // inactive lanes since VSELECT is safe when given undefined elements).
27017 EVT MaskVT = Op.getOperand(0).getValueType();
27018 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
27019 auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
27021 MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
27022
27023 auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
27024 Mask, Op1, Op2);
27025
27026 return convertFromScalableVector(DAG, VT, ScalableRes);
27027}
27028
27029SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
27030 SDValue Op, SelectionDAG &DAG) const {
27031 SDLoc DL(Op);
27032 EVT InVT = Op.getOperand(0).getValueType();
27033 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
27034
27035 assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
27036 "Only expected to lower fixed length vector operation!");
27037 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
27038 "Expected integer result of the same bit length as the inputs!");
27039
27040 auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
27041 auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
27042 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
27043
27044 EVT CmpVT = Pg.getValueType();
27045 auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
27046 {Pg, Op1, Op2, Op.getOperand(2)});
27047
27048 EVT PromoteVT = ContainerVT.changeTypeToInteger();
27049 auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
27050 return convertFromScalableVector(DAG, Op.getValueType(), Promote);
27051}
27052
27053SDValue
27054AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
27055 SelectionDAG &DAG) const {
27056 SDLoc DL(Op);
27057 auto SrcOp = Op.getOperand(0);
27058 EVT VT = Op.getValueType();
27059 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27060 EVT ContainerSrcVT =
27061 getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
27062
27063 SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
27064 Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
27065 return convertFromScalableVector(DAG, VT, Op);
27066}
27067
27068SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
27069 SDValue Op, SelectionDAG &DAG) const {
27070 SDLoc DL(Op);
27071 unsigned NumOperands = Op->getNumOperands();
27072
27073 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
27074 "Unexpected number of operands in CONCAT_VECTORS");
27075
27076 auto SrcOp1 = Op.getOperand(0);
27077 auto SrcOp2 = Op.getOperand(1);
27078 EVT VT = Op.getValueType();
27079 EVT SrcVT = SrcOp1.getValueType();
27080
27081 if (NumOperands > 2) {
27083 EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
27084 for (unsigned I = 0; I < NumOperands; I += 2)
27085 Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
27086 Op->getOperand(I), Op->getOperand(I + 1)));
27087
27088 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
27089 }
27090
27091 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27092
27094 SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
27095 SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
27096
27097 Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
27098
27099 return convertFromScalableVector(DAG, VT, Op);
27100}
27101
27102SDValue
27103AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
27104 SelectionDAG &DAG) const {
27105 EVT VT = Op.getValueType();
27106 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27107
27108 SDLoc DL(Op);
27109 SDValue Val = Op.getOperand(0);
27110 SDValue Pg = getPredicateForVector(DAG, DL, VT);
27111 EVT SrcVT = Val.getValueType();
27112 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27113 EVT ExtendVT = ContainerVT.changeVectorElementType(
27114 SrcVT.getVectorElementType());
27115
27116 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
27117 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
27118
27119 Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
27120 Val = getSVESafeBitCast(ExtendVT, Val, DAG);
27121 Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
27122 Pg, Val, DAG.getUNDEF(ContainerVT));
27123
27124 return convertFromScalableVector(DAG, VT, Val);
27125}
27126
27127SDValue
27128AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
27129 SelectionDAG &DAG) const {
27130 EVT VT = Op.getValueType();
27131 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27132
27133 SDLoc DL(Op);
27134 SDValue Val = Op.getOperand(0);
27135 EVT SrcVT = Val.getValueType();
27136 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27137 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
27139 SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
27140
27141 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27142 Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
27143 Op.getOperand(1), DAG.getUNDEF(RoundVT));
27144 Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
27145 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
27146
27147 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
27148 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
27149}
27150
27151SDValue
27152AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
27153 SelectionDAG &DAG) const {
27154 EVT VT = Op.getValueType();
27155 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27156
27157 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
27158 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
27160
27161 SDLoc DL(Op);
27162 SDValue Val = Op.getOperand(0);
27163 EVT SrcVT = Val.getValueType();
27164 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27165 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27166
27167 if (VT.bitsGE(SrcVT)) {
27169
27170 Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
27171 VT.changeTypeToInteger(), Val);
27172
27173 // It is safe to use a larger-than-specified operand because promoting the
27174 // value changes nothing from an arithmetic point of view.
27175 Val =
27176 convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
27177 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
27178 DAG.getUNDEF(ContainerDstVT));
27179 return convertFromScalableVector(DAG, VT, Val);
27180 } else {
27181 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
27182 ContainerDstVT.getVectorElementType());
27184
27185 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27186 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
27187 Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
27188 Val = convertFromScalableVector(DAG, SrcVT, Val);
27189
27190 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
27191 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
27192 }
27193}
27194
27195SDValue
27196AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
27197 SelectionDAG &DAG) const {
27198 SDLoc DL(Op);
27199 EVT OpVT = Op.getValueType();
27200 assert(OpVT.isScalableVector() &&
27201 "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
27202 SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
27203 Op.getOperand(1));
27204 SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
27205 Op.getOperand(1));
27206 return DAG.getMergeValues({Even, Odd}, DL);
27207}
27208
27209SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
27210 SelectionDAG &DAG) const {
27211 SDLoc DL(Op);
27212 EVT OpVT = Op.getValueType();
27213 assert(OpVT.isScalableVector() &&
27214 "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
27215
27216 SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
27217 Op.getOperand(1));
27218 SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0),
27219 Op.getOperand(1));
27220 return DAG.getMergeValues({Lo, Hi}, DL);
27221}
27222
27223SDValue
27224AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
27225 SelectionDAG &DAG) const {
27226 EVT VT = Op.getValueType();
27227 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27228
27229 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
27230 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
27232
27233 SDLoc DL(Op);
27234 SDValue Val = Op.getOperand(0);
27235 EVT SrcVT = Val.getValueType();
27236 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27237 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27238
27239 if (VT.bitsGT(SrcVT)) {
27240 EVT CvtVT = ContainerDstVT.changeVectorElementType(
27241 ContainerSrcVT.getVectorElementType());
27243
27244 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
27245 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
27246
27247 Val = convertToScalableVector(DAG, ContainerDstVT, Val);
27248 Val = getSVESafeBitCast(CvtVT, Val, DAG);
27249 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
27250 DAG.getUNDEF(ContainerDstVT));
27251 return convertFromScalableVector(DAG, VT, Val);
27252 } else {
27253 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
27255
27256 // Safe to use a larger than specified result since an fp_to_int where the
27257 // result doesn't fit into the destination is undefined.
27258 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27259 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
27260 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
27261
27262 return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
27263 }
27264}
27265
27267 ArrayRef<int> ShuffleMask, EVT VT,
27268 EVT ContainerVT, SelectionDAG &DAG) {
27269 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
27270 SDLoc DL(Op);
27271 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
27272 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
27273 bool IsSingleOp =
27274 ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());
27275
27276 if (!Subtarget.isNeonAvailable() && !MinSVESize)
27277 MinSVESize = 128;
27278
27279 // Give up on two-operand shuffles when SVE2 (needed for TBL2) is not
27280 // available or when not all index values can be represented.
27281 if (!IsSingleOp && !Subtarget.hasSVE2())
27282 return SDValue();
27283
27284 EVT VTOp1 = Op.getOperand(0).getValueType();
27285 unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
27286 unsigned IndexLen = MinSVESize / BitsPerElt;
27287 unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
27288 uint64_t MaxOffset = APInt(BitsPerElt, -1, false).getZExtValue();
27289 EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
27290 EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
27291 bool MinMaxEqual = (MinSVESize == MaxSVESize);
27292 assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
27293 "Incorrectly legalised shuffle operation");
27294
27296 // If MinSVESize is not equal to MaxSVESize then we need to know which
27297 // TBL mask element needs adjustment.
27298 SmallVector<SDValue, 8> AddRuntimeVLMask;
27299
27300 // Bail out for 8-bit element types, because with a 2048-bit SVE register
27301 // size, 8 bits are only sufficient to index into the first source vector.
27302 if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
27303 return SDValue();
27304
27305 for (int Index : ShuffleMask) {
27307 // Handle poison index values.
27307 if (Index < 0)
27308 Index = 0;
27309 // If the mask refers to elements in the second operand, then we have to
27310 // offset the index by the number of elements in a vector. If this number
27311 // is not known at compile-time, we need to maintain a mask with 'VL' values
27312 // to add at runtime.
27313 if ((unsigned)Index >= ElementsPerVectorReg) {
27314 if (MinMaxEqual) {
27315 Index += IndexLen - ElementsPerVectorReg;
27316 } else {
27317 Index = Index - ElementsPerVectorReg;
27318 AddRuntimeVLMask.push_back(DAG.getConstant(1, DL, MVT::i64));
27319 }
27320 } else if (!MinMaxEqual)
27321 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
27322 // For 8-bit elements and a 1024-bit SVE register, MaxOffset equals 255 and
27323 // might point at the last element in the second operand of the
27324 // shufflevector, so reject the transform in that case.
27325 if ((unsigned)Index >= MaxOffset)
27326 return SDValue();
27327 TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
27328 }
27329
27330 // Pad the remaining mask entries with an out-of-range index so those lanes
27331 // are zeroed, rather than with a zero index, which would duplicate the first
27332 // lane for the out-of-range elements. For i8 elements an out-of-range index
27333 // can still be a valid index with a 2048-bit vector register size.
27334 for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
27335 TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
27336 if (!MinMaxEqual)
27337 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
27338 }
27339
27340 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
27341 SDValue VecMask =
27342 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
27343 SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
27344
27345 SDValue Shuffle;
27346 if (IsSingleOp)
27347 Shuffle =
27348 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
27349 DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
27350 Op1, SVEMask);
27351 else if (Subtarget.hasSVE2()) {
27352 if (!MinMaxEqual) {
27353 unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
27354 SDValue VScale = (BitsPerElt == 64)
27355 ? DAG.getVScale(DL, MVT::i64, APInt(64, MinNumElts))
27356 : DAG.getVScale(DL, MVT::i32, APInt(32, MinNumElts));
27357 SDValue VecMask =
27358 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
27359 SDValue MulByMask = DAG.getNode(
27360 ISD::MUL, DL, MaskType,
27361 DAG.getNode(ISD::SPLAT_VECTOR, DL, MaskType, VScale),
27362 DAG.getBuildVector(MaskType, DL,
27363 ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
27364 SDValue UpdatedVecMask =
27365 DAG.getNode(ISD::ADD, DL, MaskType, VecMask, MulByMask);
27366 SVEMask = convertToScalableVector(
27367 DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask);
27368 }
27369 Shuffle =
27370 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
27371 DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
27372 Op1, Op2, SVEMask);
27373 }
27374 Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
27375 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
27376}
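// Illustrative sketch (not from the original source): for a two-source v4i32
// shuffle with mask <0, 5, 2, 7> and a known 128-bit register (IndexLen == 4),
// the TBL2 mask is used as-is, since indices >= 4 already select from the
// second register. If only the minimum size is known, the same entries are
// stored as <0, 1, 2, 3> together with the runtime-VL mask <0, 1, 0, 1>,
// which is multiplied by the per-register element count and added back in
// before calling aarch64_sve_tbl2.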
27377
27378SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
27379 SDValue Op, SelectionDAG &DAG) const {
27380 EVT VT = Op.getValueType();
27381 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27382
27383 auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
27384 auto ShuffleMask = SVN->getMask();
27385
27386 SDLoc DL(Op);
27387 SDValue Op1 = Op.getOperand(0);
27388 SDValue Op2 = Op.getOperand(1);
27389
27390 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27391 Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
27392 Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
27393
27394 auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
27395 if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
27396 return MVT::i32;
27397 return ScalarTy;
27398 };
27399
27400 if (SVN->isSplat()) {
27401 unsigned Lane = std::max(0, SVN->getSplatIndex());
27402 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
27403 SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
27404 DAG.getConstant(Lane, DL, MVT::i64));
27405 Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
27406 return convertFromScalableVector(DAG, VT, Op);
27407 }
27408
27409 bool ReverseEXT = false;
27410 unsigned Imm;
27411 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
27412 Imm == VT.getVectorNumElements() - 1) {
27413 if (ReverseEXT)
27414 std::swap(Op1, Op2);
27415 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
27416 SDValue Scalar = DAG.getNode(
27417 ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
27418 DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
27419 Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
27420 return convertFromScalableVector(DAG, VT, Op);
27421 }
27422
27423 for (unsigned LaneSize : {64U, 32U, 16U}) {
27424 if (isREVMask(ShuffleMask, VT, LaneSize)) {
27425 EVT NewVT =
27427 unsigned RevOp;
27428 unsigned EltSz = VT.getScalarSizeInBits();
27429 if (EltSz == 8)
27431 else if (EltSz == 16)
27433 else
27435
27436 Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
27437 Op = LowerToPredicatedOp(Op, DAG, RevOp);
27438 Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
27439 return convertFromScalableVector(DAG, VT, Op);
27440 }
27441 }
27442
27443 if (Subtarget->hasSVE2p1() && VT.getScalarSizeInBits() == 64 &&
27444 isREVMask(ShuffleMask, VT, 128)) {
27445 if (!VT.isFloatingPoint())
27446 return LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
27447
27449 Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
27450 Op = LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
27451 Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
27452 return convertFromScalableVector(DAG, VT, Op);
27453 }
27454
27455 unsigned WhichResult;
27456 if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
27458 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
27459
27460 if (isTRNMask(ShuffleMask, VT, WhichResult)) {
27461 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
27463 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
27464 }
27465
27466 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
27468 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
27469
27470 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
27471 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
27473 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
27474 }
27475
27476 // Functions like isZIPMask return true when an ISD::VECTOR_SHUFFLE's mask
27477 // represents the same logical operation as performed by a ZIP instruction. In
27478 // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
27479 // equivalent to an AArch64 instruction. There's the extra component of
27480 // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
27481 // only operated on 64/128bit vector types that have a direct mapping to a
27482 // target register and so an exact mapping is implied.
27483 // However, when using SVE for fixed length vectors, most legal vector types
27484 // are actually sub-vectors of a larger SVE register. When mapping
27485 // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
27486 // how the mask's indices translate. Specifically, when the mapping requires
27487 // an exact meaning for a specific vector index (e.g. Index X is the last
27488 // vector element in the register) then such mappings are often only safe when
27489 // the exact SVE register size is known. The main exception to this is when
27490 // indices are logically relative to the first element of either
27491 // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
27492 // when converting from fixed-length to scalable vector types (i.e. the start
27493 // of a fixed length vector is always the start of a scalable vector).
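// Illustrative example (not from the original source): in a v4i32 reverse
// mask <3, 2, 1, 0>, index 3 means "last element of the register" only if the
// register is exactly 128 bits wide; on wider implementations the
// fixed-length data occupies just the low 128 bits, so lane 3 is no longer
// the register's last lane. That is why the size-sensitive combines below
// require MinSVESize == MaxSVESize == VT.getSizeInBits().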
27494 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
27495 unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
27496 if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
27497 if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
27498 Op2.isUndef()) {
27499 Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
27500 return convertFromScalableVector(DAG, VT, Op);
27501 }
27502
27503 if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
27505 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
27506
27507 if (isUZPMask(ShuffleMask, VT, WhichResult)) {
27508 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
27510 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
27511 }
27512
27513 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
27515 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
27516
27517 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
27518 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
27520 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
27521 }
27522 }
27523
27524 // Avoid producing TBL instruction if we don't know SVE register minimal size,
27525 // unless NEON is not available and we can assume minimal SVE register size is
27526 // 128-bits.
27527 if (MinSVESize || !Subtarget->isNeonAvailable())
27528 return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
27529 DAG);
27530
27531 return SDValue();
27532}
27533
27534SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
27535 SelectionDAG &DAG) const {
27536 SDLoc DL(Op);
27537 EVT InVT = Op.getValueType();
27538
27539 assert(VT.isScalableVector() && isTypeLegal(VT) &&
27540 InVT.isScalableVector() && isTypeLegal(InVT) &&
27541 "Only expect to cast between legal scalable vector types!");
27542 assert(VT.getVectorElementType() != MVT::i1 &&
27543 InVT.getVectorElementType() != MVT::i1 &&
27544 "For predicate bitcasts, use getSVEPredicateBitCast");
27545
27546 if (InVT == VT)
27547 return Op;
27548
27550 EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
27551
27552 // Safe bitcasting between unpacked vector types of different element counts
27553 // is currently unsupported because the following is missing the necessary
27554 // work to ensure the result's elements live where they're supposed to within
27555 // an SVE register.
27556 // 01234567
27557 // e.g. nxv2i32 = XX??XX??
27558 // nxv4f16 = X?X?X?X?
27560 VT == PackedVT || InVT == PackedInVT) &&
27561 "Unexpected bitcast!");
27562
27563 // Pack input if required.
27564 if (InVT != PackedInVT)
27565 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
27566
27567 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
27568
27569 // Unpack result if required.
27570 if (VT != PackedVT)
27572
27573 return Op;
27574}
27575
27577 SDValue N) const {
27578 return ::isAllActivePredicate(DAG, N);
27579}
27580
27582 return ::getPromotedVTForPredicate(VT);
27583}
27584
27585bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
27586 SDValue Op, const APInt &OriginalDemandedBits,
27587 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
27588 unsigned Depth) const {
27589
27590 unsigned Opc = Op.getOpcode();
27591 switch (Opc) {
27592 case AArch64ISD::VSHL: {
27593 // Match (VSHL (VLSHR Val X) X)
27594 SDValue ShiftL = Op;
27595 SDValue ShiftR = Op->getOperand(0);
27596 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
27597 return false;
27598
27599 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
27600 return false;
27601
27602 unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
27603 unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
27604
27605 // Other cases can be handled as well, but this is not
27606 // implemented.
27607 if (ShiftRBits != ShiftLBits)
27608 return false;
27609
27610 unsigned ScalarSize = Op.getScalarValueSizeInBits();
27611 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
27612
27613 APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
27614 APInt UnusedBits = ~OriginalDemandedBits;
27615
27616 if ((ZeroBits & UnusedBits) != ZeroBits)
27617 return false;
27618
27619 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
27620 // used - simplify to just Val.
27621 return TLO.CombineTo(Op, ShiftR->getOperand(0));
27622 }
27624 if (auto ElementSize = IsSVECntIntrinsic(Op)) {
27625 unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
27626 if (!MaxSVEVectorSizeInBits)
27627 MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
27628 unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
27629 // The SVE count intrinsics don't support the multiplier immediate so we
27630 // don't have to account for that here. The value returned may be slightly
27631 // over the true required bits, as this is based on the "ALL" pattern. The
27632 // other patterns are also exposed by these intrinsics, but they all
27633 // return a value that's strictly less than "ALL".
27634 unsigned RequiredBits = llvm::bit_width(MaxElements);
27635 unsigned BitWidth = Known.Zero.getBitWidth();
27636 if (RequiredBits < BitWidth)
27637 Known.Zero.setHighBits(BitWidth - RequiredBits);
27638 return false;
27639 }
27640 }
27641 }
27642
27643 return TargetLowering::SimplifyDemandedBitsForTargetNode(
27644 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
27645}
27646
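// Constants already materialised as target splats (DUP or MOVI, possibly
// wrapped in an EXTRACT_SUBVECTOR) are reported as canonical so that generic
// combines do not try to rebuild them.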
27647bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
27648 return Op.getOpcode() == AArch64ISD::DUP ||
27649 Op.getOpcode() == AArch64ISD::MOVI ||
27650 (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
27651 Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
27652 TargetLowering::isTargetCanonicalConstantNode(Op);
27653}
27654
27655bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
27656 return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
27657 Subtarget->hasComplxNum();
27658}
27659
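// Reports which vector types the complex deinterleaving pass may lower here.
// Roughly: the vector must be a power-of-2 size of at least 128 bits (or
// exactly 64 bits with NEON), with f16 (given full FP16), f32 or f64
// elements, or with 8-64 bit integer elements on scalable types when SVE2 is
// available. For example, <4 x float> (given FEAT_FCMA) and
// <vscale x 4 x float> qualify, while <vscale x 1 x i64> (only 64 bits per
// vector granule) does not.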
27660bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
27661 ComplexDeinterleavingOperation Operation, Type *Ty) const {
27662 auto *VTy = dyn_cast<VectorType>(Ty);
27663 if (!VTy)
27664 return false;
27665
27666 // If the vector is scalable, SVE is enabled, which implies complex number
27667 // support. Otherwise, we need to ensure complex number support is available.
27668 if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
27669 return false;
27670
27671 auto *ScalarTy = VTy->getScalarType();
27672 unsigned NumElements = VTy->getElementCount().getKnownMinValue();
27673
27674 // We can only process vectors with a bit size of 128 or higher (or exactly
27675 // 64 bits with NEON). Additionally, these vectors must have a power-of-2
27676 // size, as we later split them into the smallest supported size and merge
27677 // them back together after applying the complex operation.
27678 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
27679 if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
27680 !llvm::isPowerOf2_32(VTyWidth))
27681 return false;
27682
27683 if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
27684 unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
27685 return 8 <= ScalarWidth && ScalarWidth <= 64;
27686 }
27687
27688 return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
27689 ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
27690}
27691
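// Emits the IR for one complex arithmetic operation. Vectors wider than 128
// bits are split in half, lowered recursively and reassembled with
// llvm.vector.insert; 128-bit (or 64-bit NEON) pieces map onto the cmla/fcmla
// and cadd/fcadd style intrinsics, with the rotation selecting between the
// 0/90/180/270 variants. For illustration, a partial complex multiply on
// <8 x float> becomes two <4 x float> vcmla calls whose results are inserted
// into the low and high halves of the final <8 x float> value.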
27692Value *AArch64TargetLowering::createComplexDeinterleavingIR(
27693 IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
27694 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
27695 Value *Accumulator) const {
27696 VectorType *Ty = cast<VectorType>(InputA->getType());
27697 bool IsScalable = Ty->isScalableTy();
27698 bool IsInt = Ty->getElementType()->isIntegerTy();
27699
27700 unsigned TyWidth =
27701 Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
27702
27703 assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
27704 "Vector type must be either 64 or a power of 2 that is at least 128");
27705
27706 if (TyWidth > 128) {
27707 int Stride = Ty->getElementCount().getKnownMinValue() / 2;
27708 auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
27709 auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0));
27710 auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0));
27711 auto *UpperSplitA =
27712 B.CreateExtractVector(HalfTy, InputA, B.getInt64(Stride));
27713 auto *UpperSplitB =
27714 B.CreateExtractVector(HalfTy, InputB, B.getInt64(Stride));
27715 Value *LowerSplitAcc = nullptr;
27716 Value *UpperSplitAcc = nullptr;
27717 if (Accumulator) {
27718 LowerSplitAcc = B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(0));
27719 UpperSplitAcc =
27720 B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(Stride));
27721 }
27722 auto *LowerSplitInt = createComplexDeinterleavingIR(
27723 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
27724 auto *UpperSplitInt = createComplexDeinterleavingIR(
27725 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
27726
27727 auto *Result = B.CreateInsertVector(Ty, PoisonValue::get(Ty), LowerSplitInt,
27728 B.getInt64(0));
27729 return B.CreateInsertVector(Ty, Result, UpperSplitInt, B.getInt64(Stride));
27730 }
27731
27732 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
27733 if (Accumulator == nullptr)
27734 return nullptr;
27735
27736 if (IsScalable) {
27737 if (IsInt)
27738 return B.CreateIntrinsic(
27739 Intrinsic::aarch64_sve_cmla_x, Ty,
27740 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
27741
27742 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
27743 return B.CreateIntrinsic(
27744 Intrinsic::aarch64_sve_fcmla, Ty,
27745 {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
27746 }
27747
27748 Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
27749 Intrinsic::aarch64_neon_vcmla_rot90,
27750 Intrinsic::aarch64_neon_vcmla_rot180,
27751 Intrinsic::aarch64_neon_vcmla_rot270};
27752
27753
27754 return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
27755 {Accumulator, InputA, InputB});
27756 }
27757
27758 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
27759 if (IsScalable) {
27760 if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
27761 Rotation == ComplexDeinterleavingRotation::Rotation_270) {
27762 if (IsInt)
27763 return B.CreateIntrinsic(
27764 Intrinsic::aarch64_sve_cadd_x, Ty,
27765 {InputA, InputB, B.getInt32((int)Rotation * 90)});
27766
27767 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
27768 return B.CreateIntrinsic(
27769 Intrinsic::aarch64_sve_fcadd, Ty,
27770 {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
27771 }
27772 return nullptr;
27773 }
27774
27775 Intrinsic::ID IntId = Intrinsic::not_intrinsic;
27776 if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
27777 IntId = Intrinsic::aarch64_neon_vcadd_rot90;
27778 else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
27779 IntId = Intrinsic::aarch64_neon_vcadd_rot270;
27780
27781 if (IntId == Intrinsic::not_intrinsic)
27782 return nullptr;
27783
27784 return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
27785 }
27786
27787 return nullptr;
27788}
27789
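// Prefer to keep splats of extended values in vector form when they feed a
// multiply, so patterns like mul(zext(dup(x)), y) can still be matched to the
// widening [us]mull instructions; otherwise scalarizing the splat is fine.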
27790bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
27791 unsigned Opc = N->getOpcode();
27792 if (ISD::isExtOpcode(Opc)) {
27793 if (any_of(N->uses(),
27794 [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
27795 return false;
27796 }
27797 return true;
27798}
27799
27800unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
27801 return Subtarget->getMinimumJumpTableEntries();
27802}
27803
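// For fixed-length vectors that are only legal because SVE is used for
// fixed-length lowering, derive the register type from the breakdown computed
// below so the calling convention still sees NEON-sized registers.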
27804MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
27805 CallingConv::ID CC,
27806 EVT VT) const {
27807 bool NonUnitFixedLengthVector =
27808 VT.isFixedLengthVector() && VT.getVectorMinNumElements() != 1;
27809 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
27810 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
27811
27812 EVT VT1;
27813 MVT RegisterVT;
27814 unsigned NumIntermediates;
27815 getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
27816 RegisterVT);
27817 return RegisterVT;
27818}
27819
27820unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
27821 LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
27822 bool NonUnitFixedLengthVector =
27823 VT.isFixedLengthVector() && VT.getVectorMinNumElements() != 1;
27824 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
27825 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
27826
27827 EVT VT1;
27828 MVT VT2;
27829 unsigned NumIntermediates;
27830 return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
27831 NumIntermediates, VT2);
27832}
27833
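// Breaks oversized fixed-length vectors back down to 128-bit NEON registers
// for argument passing. As a worked example, assuming -msve-vector-bits=512
// so that v16i32 is legal as a single 512-bit register: the generic breakdown
// returns one v16i32, and since 512 > 128 it is re-split into four v4i32
// registers, i.e. NumIntermediates = NumRegs = 4 and
// IntermediateVT = RegisterVT = v4i32.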
27834unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
27835 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
27836 unsigned &NumIntermediates, MVT &RegisterVT) const {
27837 int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
27838 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
27839 if (!RegisterVT.isFixedLengthVector() ||
27840 RegisterVT.getFixedSizeInBits() <= 128)
27841 return NumRegs;
27842
27843 assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
27844 assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
27845 assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
27846
27847 // A size mismatch here implies either type promotion or widening and would
27848 // have resulted in scalarisation if larger vectors had not been available.
27849 if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
27850 EVT EltTy = VT.getVectorElementType();
27851 EVT NewVT = EVT::getVectorVT(Context, EltTy, 128 / EltTy.getSizeInBits());
27852 if (!isTypeLegal(NewVT))
27853 NewVT = EltTy;
27854
27855 IntermediateVT = NewVT;
27856 NumIntermediates = VT.getVectorNumElements();
27857 RegisterVT = getRegisterType(Context, NewVT);
27858 return NumIntermediates;
27859 }
27860
27861 // SVE VLS support does not introduce a new ABI so we should use NEON sized
27862 // types for vector arguments and returns.
27863
27864 unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
27865 NumIntermediates *= NumSubRegs;
27866 NumRegs *= NumSubRegs;
27867
27868 switch (RegisterVT.getVectorElementType().SimpleTy) {
27869 default:
27870 llvm_unreachable("unexpected element type for vector");
27871 case MVT::i8:
27872 IntermediateVT = RegisterVT = MVT::v16i8;
27873 break;
27874 case MVT::i16:
27875 IntermediateVT = RegisterVT = MVT::v8i16;
27876 break;
27877 case MVT::i32:
27878 IntermediateVT = RegisterVT = MVT::v4i32;
27879 break;
27880 case MVT::i64:
27881 IntermediateVT = RegisterVT = MVT::v2i64;
27882 break;
27883 case MVT::f16:
27884 IntermediateVT = RegisterVT = MVT::v8f16;
27885 break;
27886 case MVT::f32:
27887 IntermediateVT = RegisterVT = MVT::v4f32;
27888 break;
27889 case MVT::f64:
27890 IntermediateVT = RegisterVT = MVT::v2f64;
27891 break;
27892 case MVT::bf16:
27893 IntermediateVT = RegisterVT = MVT::v8bf16;
27894 break;
27895 }
27896
27897 return NumRegs;
27898}
27899
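// Inline stack probes are emitted when the function was compiled with stack
// probing enabled and the target is not Windows, which uses its own chkstk
// based probing instead.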
27900bool AArch64TargetLowering::hasInlineStackProbe(
27901 const MachineFunction &MF) const {
27902 return !Subtarget->isTargetWindows() &&
27903 MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
27904}
Return true if the target supplies and combines to a paired load two loaded values of type LoadedType...
bool isOpSuitableForLDPSTP(const Instruction *I) const
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI)
bool isLegalAddImmediate(int64_t) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool shouldConsiderGEPOffsetSplit() const override
bool isVectorClearMaskLegal(ArrayRef< int > M, EVT VT) const override
Similar to isShuffleMaskLegal.
const MCPhysReg * getScratchRegisters(CallingConv::ID CC) const override
Returns a 0 terminated array of registers that can be safely used as scratch registers.
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) const
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II, StoreInst *SI) const override
Lower an interleave intrinsic to a target specific store intrinsic.
bool enableAggressiveFMAFusion(EVT VT) const override
Enable aggressive FMA fusion on targets that want it.
Value * getSDagStackGuard(const Module &M) const override
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
MachineBasicBlock * EmitDynamicProbedAlloc(MachineInstr &MI, MachineBasicBlock *MBB) const
SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, unsigned Condition, SDValue PStateSM=SDValue()) const
If a change in streaming mode is required on entry to/return from a function call it emits and return...
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override
Return true if the @llvm.get.active.lane.mask intrinsic should be expanded using generic code in Sele...
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON=false) const
bool mergeStoresAfterLegalization(EVT VT) const override
SVE code generation for fixed length vectors does not custom lower BUILD_VECTOR.
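Illustrative sketch (not code from this file): a DAG combine can consult the overridden hooks above through the generic TargetLowering interface. DAG, the immediate value and MVT::f64 below are assumed/illustrative.

// Minimal sketch: query AArch64 lowering hooks from within a DAG combine.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.isLegalAddImmediate(4096) &&
    TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), MVT::f64)) {
  // Both hooks are overridden by AArch64TargetLowering above.
}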
Class for arbitrary precision integers.
Definition: APInt.h:76
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:212
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:427
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:981
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition: APInt.h:207
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1491
static void sdivrem(const APInt &LHS, const APInt &RHS, APInt &Quotient, APInt &Remainder)
Definition: APInt.cpp:1896
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1370
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1002
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:349
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1439
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:187
APInt sadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1934
bool sle(const APInt &RHS) const
Signed less or equal comparison.
Definition: APInt.h:1144
APInt uadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1941
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1589
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:197
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1010
unsigned logBase2() const
Definition: APInt.h:1703
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:805
bool isMask(unsigned numBits) const
Definition: APInt.h:466
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:312
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:954
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:418
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:284
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition: APInt.h:319
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:274
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1215
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:367
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1513
an instruction to allocate memory on the stack
Definition: Instructions.h:59
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:539
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:748
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:778
@ Or
*p = old | v
Definition: Instructions.h:772
@ And
*p = old & v
Definition: Instructions.h:768
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:776
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:782
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:780
@ Nand
*p = ~(old & v)
Definition: Instructions.h:770
bool isFloatingPointOperation() const
Definition: Instructions.h:922
BinOp getOperation() const
Definition: Instructions.h:845
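Illustrative sketch of classifying an atomicrmw by the BinOp values above; AI is an assumed AtomicRMWInst *, and the comments describe the general expansion trade-off rather than this backend's exact policy.

// Sketch: decide whether an atomicrmw needs an expansion loop.
bool NeedsExpansion = false;
switch (AI->getOperation()) {
case AtomicRMWInst::Nand:  // *p = ~(old & v): no single-instruction form on most targets
  NeedsExpansion = true;
  break;
case AtomicRMWInst::Min:   // signed/unsigned min/max variants
case AtomicRMWInst::Max:
case AtomicRMWInst::UMin:
case AtomicRMWInst::UMax:
  break;
default:
  break;
}
if (AI->isFloatingPointOperation())
  NeedsExpansion = true;   // FP read-modify-write ops are commonly expanded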
This is an SDNode representing atomic operations.
bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:93
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:205
A "pseudo-class" with methods for operating on BUILD_VECTORs.
ConstantFPSDNode * getConstantFPSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant FP or null if this is not a constant FP splat.
bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power of 2,...
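Illustrative sketch of the splat query above, as it is typically used when matching vector immediates; BVN is an assumed BuildVectorSDNode *.

// Sketch: detect a constant splat BUILD_VECTOR and read back the splatted bits.
APInt SplatBits, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs) &&
    SplatBitSize <= 64) {
  uint64_t Imm = SplatBits.getZExtValue(); // candidate for an immediate-form lowering
  (void)Imm;
}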
CCState - This class holds information needed while lowering arguments and return values.
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
bool isMemLoc() const
int64_t getLocMemOffset() const
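Illustrative sketch of consuming calling-convention results with CCValAssign; ArgLocs (a SmallVector<CCValAssign, 16> filled by CCState) is assumed context.

// Sketch: walk the assigned argument locations.
for (const CCValAssign &VA : ArgLocs) {
  if (VA.isRegLoc()) {
    Register Reg = VA.getLocReg();         // argument passed in a register
    (void)Reg;
  } else {
    assert(VA.isMemLoc());
    int64_t Offset = VA.getLocMemOffset(); // argument passed on the stack
    (void)Offset;
  }
}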
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1654
unsigned arg_size() const
Definition: InstrTypes.h:1652
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1838
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
bool isZero() const
Return true if the value is positive or negative zero.
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:205
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:145
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1398
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
bool isLittleEndian() const
Layout endianness...
Definition: DataLayout.h:238
bool isBigEndian() const
Definition: DataLayout.h:239
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:874
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:299
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:296
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:307
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition: FastISel.h:66
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:168
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:135
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:680
bool empty() const
Definition: Function.h:804
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:200
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:677
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition: Function.h:262
Constant * getPersonalityFn() const
Get the personality function associated with this function.
Definition: Function.cpp:1882
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:338
arg_iterator arg_end()
Definition: Function.h:822
arg_iterator arg_begin()
Definition: Function.h:813
size_t size() const
Definition: Function.h:803
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:350
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:677
const GlobalValue * getGlobal() const
bool isThreadLocal() const
If the value is "Thread Local", its value isn't shared by the threads.
Definition: GlobalValue.h:263
bool hasExternalWeakLinkage() const
Definition: GlobalValue.h:528
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:655
Type * getValueType() const
Definition: GlobalValue.h:296
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:94
Value * CreateZExtOrBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2115
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2001
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition: IRBuilder.h:1031
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2450
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition: IRBuilder.h:1875
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2501
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1039
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:533
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2148
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcasted to NumElts elements.
Definition: IRBuilder.cpp:1214
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2494
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:460
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2051
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2100
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1431
ConstantInt * getInt8(uint8_t C)
Get a constant 8-bit value.
Definition: IRBuilder.h:470
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:174
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2065
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:485
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2105
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1410
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2005
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2472
LLVMContext & getContext() const
Definition: IRBuilder.h:176
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2095
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1491
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:563
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2390
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Definition: IRBuilder.h:1860
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:510
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2644
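Illustrative sketch of the IRBuilder calls listed above; InsertBefore (an existing Instruction *) and X (an i64 Value *) are assumed.

// Sketch: build (X << 8) | 0xff and splat it across a 4-element vector.
IRBuilder<> Builder(InsertBefore);
Value *Shifted = Builder.CreateShl(X, Builder.getInt64(8));
Value *Combined = Builder.CreateOr(Shifted, Builder.getInt64(0xff));
Value *Splat = Builder.CreateVectorSplat(4, Combined);
(void)Splat;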
This instruction inserts a single (scalar) element into a VectorType value.
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
Definition: Instruction.cpp:80
const BasicBlock * getParent() const
Definition: Instruction.h:152
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:84
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:252
Class to represent integer types.
Definition: DerivedTypes.h:40
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:267
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:100
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
Definition: LowLevelType.h:203
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
Definition: Instructions.h:184
Value * getPointerOperand()
Definition: Instructions.h:280
Type * getPointerOperandType() const
Definition: Instructions.h:283
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
@ INVALID_SIMPLE_VALUE_TYPE
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:585
bool isScalableVT() const
Return true if the type is a scalable type.
static auto all_valuetypes()
SimpleValueType Iteration.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto scalable_vector_valuetypes()
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
static auto fp_fixedlen_vector_valuetypes()
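Illustrative sketch of typical MVT queries; the type chosen is arbitrary.

// Sketch: inspect a fixed 128-bit vector type.
MVT VT = MVT::getVectorVT(MVT::i32, 4);         // v4i32
bool Is128 = VT.is128BitVector();               // true
unsigned NumElts = VT.getVectorNumElements();   // 4
MVT EltVT = VT.getVectorElementType();          // i32
uint64_t EltBits = EltVT.getScalarSizeInBits(); // 32
(void)Is128; (void)NumElts; (void)EltBits;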
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor bloc...
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
SSPLayoutKind getObjectSSPLayout(int ObjectIdx) const
void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a call frame.
int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
@ SSPLK_None
Did not trigger a stack protector.
void setFrameAddressIsTaken(bool T)
int getStackProtectorIndex() const
Return the index for the stack protector object.
int CreateSpillStackObject(uint64_t Size, Align Alignment)
Create a new statically sized stack object that represents a spill slot, returning a nonnegative iden...
void setStackID(int ObjectIdx, uint8_t ID)
void setHasTailCall(bool V=true)
bool hasMustTailInVarArgFunc() const
Returns true if the function is variadic and contains a musttail call.
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int CreateVariableSizedObject(Align Alignment, const AllocaInst *Alloca)
Notify the MachineFrameInfo object that a variable sized object has been created.
int getObjectIndexEnd() const
Return one past the maximum frame object index.
bool hasStackProtectorIndex() const
uint8_t getStackID(int ObjectIdx) const
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
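Illustrative sketch of creating and querying a stack object through MachineFrameInfo; MF is an assumed in-scope MachineFunction.

// Sketch: allocate a 16-byte, 16-byte-aligned stack slot.
MachineFrameInfo &MFI = MF.getFrameInfo();
int FI = MFI.CreateStackObject(/*Size=*/16, Align(16), /*isSpillSlot=*/false);
int64_t Size = MFI.getObjectSize(FI);     // 16
int64_t Offset = MFI.getObjectOffset(FI); // final value assigned during frame lowering
(void)Size; (void)Offset;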
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
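Illustrative sketch of chaining these MachineInstrBuilder helpers inside a custom inserter; BuildMI (from MachineInstrBuilder.h), MBB, MI, DL, TII, DestReg and SrcReg are assumed context.

// Sketch: emit "DestReg = ADDXri SrcReg, #1, lsl #0" before MI.
BuildMI(*MBB, MI, DL, TII->get(AArch64::ADDXri), DestReg)
    .addReg(SrcReg)
    .addImm(1)  // 12-bit unsigned immediate
    .addImm(0); // shift amount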
Representation of each machine instruction.
Definition: MachineInstr.h:69
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value,.
MachineOperand class - Representation of each machine instruction operand.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
size_type size() const
Definition: MapVector.h:60
This class is used to represent an MGATHER node.
const SDValue & getPassThru() const
ISD::LoadExtType getExtensionType() const
This is a base class used to represent MGATHER and MSCATTER nodes.
const SDValue & getIndex() const
const SDValue & getScale() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
ISD::MemIndexType getIndexType() const
How is Index applied to BasePtr when computing addresses.
This class is used to represent an MLOAD node.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
const SDValue & getMask() const
const SDValue & getPassThru() const
const SDValue & getOffset() const
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This class is used to represent an MSTORE node.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isVolatile() const
Align getOriginalAlign() const
Returns alignment and volatility of the memory access.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
AtomicOrdering getMergedOrdering() const
Return a single atomic ordering that is at least as strong as both the success and failure orderings ...
const SDValue & getChain() const
bool isNonTemporal() const
bool isAtomic() const
Return true if the memory operation ordering is Unordered or higher.
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
bool getRtLibUseGOT() const
Returns true if PLT should be avoided for RTLib calls.
Definition: Module.cpp:690
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:287
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
void dump() const
Definition: Pass.cpp:136
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1827
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
void dump() const
Dump this node, for debugging.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
iterator_range< use_iterator > uses()
size_t use_size() const
Return the number of uses of this node.
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Return true if this node is an UNDEF value.
void setFlags(SDNodeFlags NewFlags)
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
void dump() const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
void setNode(SDNode *N)
set the SDNode
unsigned getOpcode() const
unsigned getNumOperands() const
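Illustrative sketch of pattern-matching with the SDNode/SDValue accessors above; N is an assumed SDNode * being combined.

// Sketch: match (and X, constant) and read the mask value.
if (N->getOpcode() == ISD::AND && N->hasOneUse()) {
  SDValue X = N->getOperand(0);
  if (isa<ConstantSDNode>(N->getOperand(1))) {
    uint64_t Mask = N->getConstantOperandVal(1);
    EVT VT = N->getValueType(0);
    (void)X; (void)Mask; (void)VT;
  }
}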
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasStreamingInterface() const
bool hasStreamingCompatibleInterface() const
bool hasNonStreamingInterface() const
bool hasStreamingBody() const
bool hasZAState() const
Class to represent scalable SIMD vectors.
Definition: DerivedTypes.h:586
static ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:713
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:225
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:722
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:474
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm, bool ConstantFold=true)
Return a node that represents the runtime scaling 'MulImm * RuntimeVL'.
SDNode * isConstantIntBuildVectorOrConstantInt(SDValue N) const
Test whether the given value is a constant int or similar node.
SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getStepVector(const SDLoc &DL, EVT ResVT, const APInt &StepVal)
Returns a vector of type ResVT whose elements contain the linear sequence <0, Step,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:478
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
Definition: SelectionDAG.h:732
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:828
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:472
void addCallSiteInfo(const SDNode *Node, CallSiteInfoImpl &&CallInfo)
Set CallSiteInfo to be associated with Node.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
Definition: SelectionDAG.h:480
bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, bool isTargetGA=false, unsigned TargetFlags=0)
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
Definition: SelectionDAG.h:659
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getRegister(unsigned Reg, EVT VT)
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
SDValue getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT, EVT OpVT)
Convert Op, which must be of integer type, to the integer type VT, by using an extension appropriate ...
SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:473
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:773
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:676
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:768
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:469
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:799
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:845
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
LLVMContext * getContext() const
Definition: SelectionDAG.h:485
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:739
SDValue getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand, SDValue Subreg)
A convenience function for creating TargetInstrInfo::INSERT_SUBREG nodes.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:554
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
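Illustrative sketch of node construction with the SelectionDAG interface above; DAG, DL and X (an i32 SDValue) are assumed to be in scope.

// Sketch: build X + 1 and splat the result into a v4i32 BUILD_VECTOR.
SDValue One = DAG.getConstant(1, DL, MVT::i32);
SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i32, X, One);
SDValue Splat = DAG.getSplatBuildVector(MVT::v4i32, DL, Add);
(void)Splat;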
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static bool isSingleSourceMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector.
static void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
static bool isSplatMask(const int *Mask, EVT VT)
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
void push_back(const T &Elt)
Definition: SmallVector.h:426
pointer data()
Return a pointer to the vector's buffer, even if empty().
Definition: SmallVector.h:299
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
An instruction for storing to memory.
Definition: Instructions.h:317
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition: StringRef.h:466
StringRef slice(size_t Start, size_t End) const
Return a reference to the substring from [Start, End).
Definition: StringRef.h:680
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
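Illustrative sketch of the StringSwitch pattern above; RegName (a StringRef) is assumed, and the AArch64 register enum values are only used as plausible examples.

// Sketch: map a register-name string to a physical register, 0 if unknown.
unsigned Reg = StringSwitch<unsigned>(RegName)
                   .Case("sp", AArch64::SP)
                   .Case("fp", AArch64::FP)
                   .Case("lr", AArch64::LR)
                   .Default(0);
(void)Reg;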
Class to represent struct types.
Definition: DerivedTypes.h:216
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:373
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
EVT getMemValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const
Returns the target-specific address of the unsafe stack pointer.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
virtual bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const
Check whether or not MI needs to be moved close to its uses.
void setMaximumJumpTableSize(unsigned)
Indicate the maximum number of entries in jump tables.
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setMaxBytesForAlignment(unsigned MaxBytes)
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual Value * getSDagStackGuard(const Module &M) const
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
unsigned getMaximumJumpTableSize() const
Return upper limit for number of entries in a jump table.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
virtual Value * getIRStackGuard(IRBuilderBase &IRB) const
If the target has a standard location for the stack protector guard, returns the address of that loca...
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is a legal add immediate, that is, the target has an add instruction...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const
Return true if it is profitable to reduce a load to a smaller type.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal or custom on this target.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
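A minimal sketch of how these TargetLoweringBase configuration hooks are driven, written as it would appear inside the AArch64TargetLowering constructor; the specific types and actions below are illustrative, not the exact AArch64 settings:
addRegisterClass(MVT::i64, &AArch64::GPR64RegClass);
setOperationAction(ISD::CTPOP, MVT::i64, Custom);          // custom-lowered op
setLoadExtAction(ISD::SEXTLOAD, MVT::i64, MVT::i8, Legal);  // extending load
setTruncStoreAction(MVT::f64, MVT::f32, Expand);            // truncating store
computeRegisterProperties(Subtarget->getRegisterInfo());    // after all regclasses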
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
virtual EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
MVT getFrameIndexTy(const DataLayout &DL) const
Return the type for frame index, which is determined by the alloca address space specified through th...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
virtual bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions. Ref: "Hacker's Delight" by Henry Warren 1...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
virtual bool useLoadStackGuardNode() const
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
virtual bool isTargetCanonicalConstantNode(SDValue Op) const
Returns true if the given Opc is considered a canonical constant for the target, which should not be ...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned TLSSize
Bit size of immediate TLS offsets (0 == use the default).
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TargetCostKind CostKind) const
Return the expected cost of materializing for the given integer immediate of the specified type.
@ TCC_Free
Expected to fold away in lowering.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
bool isOSMSVCRT() const
Is this a "Windows" OS targeting a "MSVCRT.dll" environment.
Definition: Triple.h:651
bool isWindowsMSVCEnvironment() const
Checks if the environment could be MSVC.
Definition: Triple.h:618
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:330
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
static Type * getHalfTy(LLVMContext &C)
static Type * getDoubleTy(LLVMContext &C)
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition: Type.h:252
static Type * getBFloatTy(LLVMContext &C)
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
static IntegerType * getInt1Ty(LLVMContext &C)
@ FloatTyID
32-bit floating point type
Definition: Type.h:58
@ DoubleTyID
64-bit floating point type
Definition: Type.h:59
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:302
static IntegerType * getInt16Ty(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
static IntegerType * getInt8Ty(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isScalableTy() const
Return true if this is a type whose size is a known multiple of vscale.
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
static Type * getFloatTy(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:137
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getContainedType(unsigned i) const
This method is used to implement the type iterator (defined at the end of the file).
Definition: Type.h:377
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1808
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
const Use & getOperandUse(unsigned i) const
Definition: User.h:182
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
This class is used to represent EVT's, which are used to parameterize some operations.
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
Definition: DerivedTypes.h:507
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
Definition: DerivedTypes.h:454
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:676
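A minimal sketch combining the Type and VectorType factory methods listed above; Ctx is an assumed LLVMContext reference and the chosen element type is illustrative:
// Build a <4 x i32> IR type from the scalar factory plus an ElementCount.
static VectorType *makeV4I32(LLVMContext &Ctx) {
  return VectorType::get(Type::getInt32Ty(Ctx), ElementCount::getFixed(4));
}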
static VectorType * getTruncatedElementVectorType(VectorType *VTy)
Definition: DerivedTypes.h:472
Type * getElementType() const
Definition: DerivedTypes.h:436
This class represents zero extension of integer types.
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:187
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:239
self_iterator getIterator()
Definition: ilist_node.h:109
#define UINT64_MAX
Definition: DataTypes.h:77
#define INT64_MAX
Definition: DataTypes.h:71
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static CondCode getInvertedCondCode(CondCode Code)
static unsigned getNZCVToSatisfyCondCode(CondCode Code)
Given a condition code, return NZCV flags that would satisfy that condition.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
@ NVCAST
Natural vector cast.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
static uint8_t encodeAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType9(uint64_t Imm)
static bool isAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType5(uint64_t Imm)
static int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType10(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType9(uint64_t Imm)
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
static bool isAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType5(uint64_t Imm)
static int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
static bool isAdvSIMDModImmType10(uint64_t Imm)
static int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType8(uint64_t Imm)
static bool isAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType11(uint64_t Imm)
static bool isAdvSIMDModImmType11(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType6(uint64_t Imm)
static bool isAdvSIMDModImmType8(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType6(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType1(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType1(uint64_t Imm)
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
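A minimal sketch of the AArch64_AM logical-immediate helpers above: validate a bitmask before asking for its encoding. The wrapper is hypothetical, and the comment about the assertion reflects the usual usage pattern, not a guarantee from this file:
static std::optional<uint64_t> tryEncodeLogicalImm(uint64_t Imm, unsigned RegSize) {
  // Check isLogicalImmediate() first; encodeLogicalImmediate() is only
  // meaningful (and asserts internally) for encodable values.
  if (!AArch64_AM::isLogicalImmediate(Imm, RegSize))
    return std::nullopt;
  return AArch64_AM::encodeLogicalImmediate(Imm, RegSize);
}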
ArrayRef< MCPhysReg > getFPRArgRegs()
int getSMEPseudoMap(uint16_t Opcode)
static constexpr unsigned SVEMaxBitsPerVector
const unsigned RoundingBitsPos
static constexpr unsigned SVEBitsPerBlock
ArrayRef< MCPhysReg > getGPRArgRegs()
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ ARM64EC_Thunk_Native
Calling convention used in the ARM64EC ABI to implement calls between ARM64 code and thunks.
Definition: CallingConv.h:265
@ AArch64_VectorCall
Used between AArch64 Advanced SIMD functions.
Definition: CallingConv.h:221
@ Swift
Calling convention for Swift.
Definition: CallingConv.h:69
@ AArch64_SVE_VectorCall
Used between AArch64 SVE functions.
Definition: CallingConv.h:224
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition: CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserve most registers.
Definition: CallingConv.h:63
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2
Preserve X2-X15, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:241
@ CXX_FAST_TLS
Used for access functions.
Definition: CallingConv.h:72
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0
Preserve X0-X13, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:238
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition: CallingConv.h:50
@ PreserveAll
Used for runtime calls that preserve (almost) all registers.
Definition: CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ Win64
The C convention as implemented on Windows/x86-64 and AArch64.
Definition: CallingConv.h:159
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition: CallingConv.h:87
@ GRAAL
Used by GraalVM. Two additional registers are reserved.
Definition: CallingConv.h:255
@ ARM64EC_Thunk_X64
Calling convention used in the ARM64EC ABI to implement calls between x64 code and thunks.
Definition: CallingConv.h:260
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:750
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:236
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to; it returns an output chain.
Definition: ISDOpcodes.h:1126
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1122
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition: ISDOpcodes.h:476
@ VECREDUCE_SEQ_FADD
Generic reduction nodes.
Definition: ISDOpcodes.h:1339
@ VECREDUCE_SMIN
Definition: ISDOpcodes.h:1370
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:250
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:559
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:714
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1155
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1241
@ STRICT_FCEIL
Definition: ISDOpcodes.h:426
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1031
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:783
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:483
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:199
@ RETURNADDR
Definition: ISDOpcodes.h:95
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ STRICT_FMINIMUM
Definition: ISDOpcodes.h:436
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:790
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:543
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
Definition: ISDOpcodes.h:1355
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ VECREDUCE_FMAXIMUM
FMINIMUM/FMAXIMUM nodes propagate NaNs and signed zeroes using the llvm.minimum and llvm....
Definition: ISDOpcodes.h:1359
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:688
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:255
@ VECREDUCE_SMAX
Definition: ISDOpcodes.h:1369
@ STRICT_FSETCCS
Definition: ISDOpcodes.h:477
@ STRICT_FLOG2
Definition: ISDOpcodes.h:421
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1267
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:903
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:229
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1268
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:939
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition: ISDOpcodes.h:411
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1400
@ GlobalTLSAddress
Definition: ISDOpcodes.h:79
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:885
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:774
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition: ISDOpcodes.h:662
@ STRICT_UINT_TO_FP
Definition: ISDOpcodes.h:450
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:620
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition: ISDOpcodes.h:101
@ VECREDUCE_FADD
These reductions have relaxed evaluation order semantics, and have a single vector operand.
Definition: ISDOpcodes.h:1352
@ WRITE_REGISTER
Definition: ISDOpcodes.h:119
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1221
@ VECREDUCE_FMIN
Definition: ISDOpcodes.h:1356
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:988
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition: ISDOpcodes.h:758
@ STRICT_LROUND
Definition: ISDOpcodes.h:431
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:930
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1077
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:327
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1052
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1056
@ VECTOR_INTERLEAVE
VECTOR_INTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and output vectors having the same...
Definition: ISDOpcodes.h:586
@ STEP_VECTOR
STEP_VECTOR(IMM) - Returns a scalable vector whose lanes are comprised of a linear sequence of unsign...
Definition: ISDOpcodes.h:646
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:349
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:727
@ STRICT_FPOWI
Definition: ISDOpcodes.h:413
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1237
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:211
@ VECREDUCE_UMAX
Definition: ISDOpcodes.h:1371
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition: ISDOpcodes.h:627
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1151
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:323
@ STRICT_FTRUNC
Definition: ISDOpcodes.h:430
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
Definition: ISDOpcodes.h:1364
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
Definition: ISDOpcodes.h:880
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:651
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:705
@ ATOMIC_LOAD_CLR
Definition: ISDOpcodes.h:1266
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:600
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1265
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:573
@ STRICT_FMAXIMUM
Definition: ISDOpcodes.h:435
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition: ISDOpcodes.h:47
@ STRICT_FMAXNUM
Definition: ISDOpcodes.h:424
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition: ISDOpcodes.h:118
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:535
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:203
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:780
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1211
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:856
@ STRICT_FMINNUM
Definition: ISDOpcodes.h:425
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:742
@ VSCALE
VSCALE(IMM) - Returns the runtime scaling factor used to calculate the number of elements within a sc...
Definition: ISDOpcodes.h:1329
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1248
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:971
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
Definition: ISDOpcodes.h:1215
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:331
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1041
@ STRICT_LRINT
Definition: ISDOpcodes.h:433
@ ConstantPool
Definition: ISDOpcodes.h:82
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:798
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:674
@ VECTOR_REVERSE
VECTOR_REVERSE(VECTOR) - Returns a vector, of the same type as VECTOR, whose elements are shuffled us...
Definition: ISDOpcodes.h:591
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:888
@ STRICT_FROUND
Definition: ISDOpcodes.h:428
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:736
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:303
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition: ISDOpcodes.h:449
@ VECREDUCE_UMIN
Definition: ISDOpcodes.h:1372
@ STRICT_FFLOOR
Definition: ISDOpcodes.h:427
@ STRICT_FROUNDEVEN
Definition: ISDOpcodes.h:429
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:94
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1263
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:443
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:465
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:442
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:984
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1264
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:836
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1182
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition: ISDOpcodes.h:158
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:470
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:680
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1208
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:184
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition: ISDOpcodes.h:657
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition: ISDOpcodes.h:400
@ STRICT_FLOG10
Definition: ISDOpcodes.h:420
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:524
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ STRICT_LLRINT
Definition: ISDOpcodes.h:434
@ VECTOR_SPLICE
VECTOR_SPLICE(VEC1, VEC2, IMM) - Returns a subvector of the same type as VEC1/VEC2 from CONCAT_VECTOR...
Definition: ISDOpcodes.h:612
@ STRICT_FEXP2
Definition: ISDOpcodes.h:418
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1262
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:869
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition: ISDOpcodes.h:106
@ STRICT_LLROUND
Definition: ISDOpcodes.h:432
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition: ISDOpcodes.h:831
@ STRICT_FNEARBYINT
Definition: ISDOpcodes.h:423
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:855
@ VECREDUCE_FMINIMUM
Definition: ISDOpcodes.h:1360
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:786
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1146
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1070
@ BlockAddress
Definition: ISDOpcodes.h:84
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:763
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:493
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:340
@ AssertZext
Definition: ISDOpcodes.h:62
@ STRICT_FRINT
Definition: ISDOpcodes.h:422
@ VECTOR_DEINTERLEAVE
VECTOR_DEINTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and output vectors having the sa...
Definition: ISDOpcodes.h:580
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:313
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:515
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isExtOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1594
bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
bool isVectorShrinkable(const SDNode *N, unsigned NewEltSize, bool Signed)
Returns true if the specified node is a vector where all elements can be truncated to the specified e...
CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
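A minimal sketch of the ISD::CondCode helpers above: derive the condition for !(Y op X) from the one for (X op Y); the wrapper is hypothetical:
static ISD::CondCode invertOfSwapped(ISD::CondCode CC, EVT OperandVT) {
  // getSetCCSwappedOperands turns (X op Y) into (Y op X); getSetCCInverse
  // then negates it, with the operand type selecting FP vs. integer rules.
  return ISD::getSetCCInverse(ISD::getSetCCSwappedOperands(CC), OperandVT);
}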
MemIndexType
MemIndexType enum - This enum defines how to interpret MGATHER/SCATTER's index parameter when calcula...
Definition: ISDOpcodes.h:1485
bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:1472
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1523
bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1503
static const int LAST_INDEXED_MODE
Definition: ISDOpcodes.h:1474
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1459
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:765
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:821
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:163
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
VScaleVal_match m_VScale()
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
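A minimal sketch of the IR-level PatternMatch combinators indexed above, recognising a widening multiply zext(a) * zext(b); the helper and the specific pattern are illustrative:
static bool matchWideningMul(Value *V, Value *&A, Value *&B) {
  using namespace llvm::PatternMatch;
  // m_Value(A) binds the matched operand; m_OneUse could be layered on top
  // if a combine requires the intermediate extends to have a single user.
  return match(V, m_Mul(m_ZExt(m_Value(A)), m_ZExt(m_Value(B))));
}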
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
@ Define
Register definition.
@ GeneralDynamic
Definition: CodeGen.h:46
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
std::optional< Function * > getAttachedARCFunction(const CallBase *CB)
This function returns operand bundle clang_arc_attachedcall's argument, which is the address of the A...
Definition: ObjCARCUtil.h:43
bool hasAttachedCallOpBundle(const CallBase *CB)
Definition: ObjCARCUtil.h:29
DiagnosticInfoOptimizationBase::Argument NV
@ FalseVal
Definition: TGLexer.h:59
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
bool isPackedVectorType(EVT SomeVT)
Definition: VECustomDAG.cpp:22
bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
@ Offset
Definition: DWP.cpp:456
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition: STLExtras.h:862
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1731
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void GetReturnInfo(CallingConv::ID CC, Type *ReturnType, AttributeList attr, SmallVectorImpl< ISD::OutputArg > &Outs, const TargetLowering &TLI, const DataLayout &DL)
Given an LLVM IR type and return type attributes, compute the return value EVTs and flags,...
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition: Utils.cpp:293
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:228
bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
bool CC_AArch64_Win64PCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition: MathExtras.h:332
bool RetCC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
std::optional< unsigned > getSVEPredPatternFromNumElements(unsigned MinNumElts)
Return specific VL predicate pattern based on the number of elements.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:269
bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition: Utils.cpp:1439
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:319
bool CC_AArch64_Arm64EC_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:258
unsigned M1(unsigned Val)
Definition: VE.h:376
bool isReleaseOrStronger(AtomicOrdering AO)
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1738
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:313
bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:264
bool CC_AArch64_Arm64EC_Thunk_Native(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool CC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition: MathExtras.h:246
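A minimal sketch of the MathExtras/bit helpers indexed above; the constants are illustrative and asserts are only for demonstration:
static void bitHelperExamples() {
  assert(isPowerOf2_64(0x100) && Log2_64(0x100) == 8);
  assert(isMask_64(0x00ff));         // contiguous ones starting at bit 0
  assert(isShiftedMask_64(0x0ff0));  // contiguous ones, possibly shifted
  assert(countr_zero(0x0ff0u) == 4); // index of the lowest set bit
}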
bool RetCC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
constexpr int PoisonMaskElem
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Other
Any other memory.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
CombineLevel
Definition: DAGCombine.h:15
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:233
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition: VE.h:375
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1758
gep_type_iterator gep_type_begin(const User *GEP)
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:2060
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
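A minimal sketch of the SVE predicate-pattern helpers above: map a fixed element count to a VLn pattern and back; the round-trip helper is hypothetical:
static bool hasExactVLPattern(unsigned MinNumElts) {
  // getSVEPredPatternFromNumElements returns std::nullopt when no VL1..VL256
  // pattern matches; the reverse mapping returns 0 for non-VLn patterns.
  std::optional<unsigned> Pat = getSVEPredPatternFromNumElements(MinNumElts);
  return Pat && getNumElementsFromSVEPredPattern(*Pat) == MinNumElts;
}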
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1888
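A minimal, self-contained sketch of the range helpers (all_of, any_of, erase_if, is_contained) indexed above; the data is illustrative:
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

static bool rangeHelperExamples() {
  SmallVector<int, 4> Vals = {1, 2, 3, 4};
  bool AllPositive = all_of(Vals, [](int V) { return V > 0; });
  erase_if(Vals, [](int V) { return V % 2 == 0; }); // drop even values
  return AllPositive && is_contained(Vals, 3) &&
         !any_of(Vals, [](int V) { return V > 3; });
}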
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition: STLExtras.h:2048
static const MachineMemOperand::Flags MOStridedAccess
bool CC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
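A minimal sketch of createSequentialMask, building the mask that selects the high half of an 8-element shuffle; the indices are illustrative:
// Produces <4, 5, 6, 7>: 4 sequential ints starting at 4, no trailing undefs.
SmallVector<int, 16> HighHalfMask =
    createSequentialMask(/*Start=*/4, /*NumInts=*/4, /*NumUndefs=*/0);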
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
@ Enable
Enable colors.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
Helper structure to keep track of a SET_CC lowered into AArch64 code.
AArch64CC::CondCode CC
Helper structure to keep track of ISD::SET_CC operands.
This is used by foldLoadsRecursive() to capture a Root Load node which is of type or(load,...
Helper structure to be able to read SetCC information.
static unsigned int semanticsPrecision(const fltSemantics &)
Definition: APFloat.cpp:292
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
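A minimal sketch of the alignment helpers above (alignTo with the Align type); the 16-byte boundary is an illustrative choice:
// Round a byte size up to a 16-byte boundary, e.g. for a stack object.
static uint64_t padTo16(uint64_t SizeInBytes) {
  return alignTo(SizeInBytes, Align(16));
}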
Represent subnormal handling kind for floating point instruction inputs and outputs.
Extended Value Type.
Definition: ValueTypes.h:34
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition: ValueTypes.h:93
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
uint64_t getScalarStoreSize() const
Definition: ValueTypes.h:387
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:274
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:290
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:340
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:448
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that is chosen by the caller.
Definition: ValueTypes.h:112
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition: ValueTypes.h:349
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition: ValueTypes.h:415
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:455
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:203
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:366
EVT widenIntegerVectorElementType(LLVMContext &Context) const
Return a VT for an integer vector type with the size of the elements doubled.
Definition: ValueTypes.h:429
bool isScalableVT() const
Return true if the type is a scalable type.
Definition: ValueTypes.h:183
bool isFixedLengthVector() const
Definition: ValueTypes.h:177
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition: ValueTypes.h:58
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition: ValueTypes.h:282
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition: ValueTypes.h:208
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:246
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:202
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition: ValueTypes.h:173
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type that is chosen by the caller.
Definition: ValueTypes.h:101
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:438
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:151
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition: ValueTypes.h:198
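A short sketch that exercises a few of the EVT queries listed above; the v4i32 type and the particular queries are arbitrary examples, not code from this file:
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;
// Build v4i32 and query some of its properties.
static void evtQueries(LLVMContext &Ctx) {
  EVT VT = EVT::getVectorVT(Ctx, MVT::i32, 4);
  bool Is128 = VT.is128BitVector();                // true: 4 x 32 bits
  unsigned NumElts = VT.getVectorNumElements();    // 4
  EVT HalfVT = VT.getHalfNumVectorElementsVT(Ctx); // v2i32
  (void)Is128; (void)NumElts; (void)HalfVT;
}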
Describes a register that needs to be forwarded from the prologue to a musttail call.
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing global or external symbol.
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:297
static KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
Definition: KnownBits.cpp:422
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:157
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:40
static KnownBits lshr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for lshr(LHS, RHS).
Definition: KnownBits.cpp:364
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known bits.
Definition: KnownBits.h:292
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition: KnownBits.h:307
static KnownBits shl(const KnownBits &LHS, const KnownBits &RHS, bool NUW=false, bool NSW=false, bool ShAmtNonZero=false)
Compute known bits for shl(LHS, RHS).
Definition: KnownBits.cpp:279
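A hedged KnownBits sketch: shifting an unknown 32-bit value left by a known constant amount makes the low bits of the result known zero. The 32-bit width and shift amount of 4 are illustrative only:
#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;
// shl of an unknown value by a known amount of 4: the result has four
// known-zero low bits.
static KnownBits shiftByKnownFour() {
  KnownBits Val(32);                                     // nothing known yet
  KnownBits Amt = KnownBits::makeConstant(APInt(32, 4));
  KnownBits Res = KnownBits::shl(Val, Amt);
  unsigned MaxActive = Res.countMaxActiveBits();         // at most 32
  (void)MaxActive;
  return Res;
}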
Structure used to represent pair of argument number after call lowering and register used to transfer that argument.
This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.
unsigned getAddrSpace() const
Return the LLVM IR address space number that this pointer points into.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
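A small sketch of the MachinePointerInfo factories above; the frame index comes from the caller and the 8-byte offset is an invented example:
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
using namespace llvm;
// Describe a fixed stack slot, then a location 8 bytes past it.
static MachinePointerInfo slotPlusEight(MachineFunction &MF, int FI) {
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  return PtrInfo.getWithOffset(8);
}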
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Constraint for a predicate of the form "cmp Pred Op, OtherOp", where Op is the value the constraint applies to.
Definition: PredicateInfo.h:74
These are IR-level optimization flags that may be propagated to SDNodes.
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::OutputArg, 32 > Outs
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetLowering to its clients that want to combine.
bool CombineTo(SDValue O, SDValue N)
Helper structure to keep track of SetCC information.
GenericSetCCInfo Generic
AArch64SetCCInfo AArch64